Aim of this analysis is to identiy processing sites (PSS) which are dependent on RNase E 5’ sensing by comparing PSS detected in two strains: rne(5p) and rne(WT). Also, PSS affected by overexpressing RNase E and RNase HII will be identified by comparing rne(WT) with wild-type Synechocystis (WT). The respective sets of TSS will be further analysed, e.g. for secondary structures. rne(WT) will partially be reffered to as “dWT”, rne(5p) as “dTV” or “TV”.
#import gff for anno-TSS
anno_TSS <- unique(rtracklayer::import("input/Kopf_TSS.gff")) # 14 int-TSS are duplicates and are also annotated as alt-TSS
features <- rtracklayer::import("input/20210217_syne_onlyUnique_withFeat.gff3")
TUs <- rtracklayer::import("input/Kopf_4091_TUs_combined.gff3")
# import PSS and TSS identified in TIER-seq analysis
TIERseq_PSS_TSS <- rtracklayer::import("input/TSS_PSS_rneAnalysis_all.gff")
Input are tables of PSS and TSS 5’ end counts prepared using a workflow on usegalaxy.eu and a python script. Sequencing data was created from libraries prepared similar to the protocol described in Innocenti et al. (2015) (tagRNA-Seq).
PSS_raw <- read.delim("input/PSS_5ends_combined_5sensing.txt", header=TRUE, row.names=1)
names(PSS_raw) <- paste(names(PSS_raw), "_PSS", sep="")
TSS_raw <- read.delim("input/TSS_5ends_combined_5sensing.txt", header=TRUE, row.names=1)
names(TSS_raw) <- paste(names(TSS_raw), "_TSS", sep="")
# merge PSS and TSS to merged_raw
merged_raw <- merge(PSS_raw, TSS_raw, by="row.names")
row.names(merged_raw) <- merged_raw$Row.names
merged_raw$Row.names <- NULL
rm(PSS_raw)
rm(TSS_raw)
When taking into account lowly populated sites, DESeq2 dispersion estimates become over-fitted. Hence, edgeR::filterByExpression() is used as a pre-processing step before further DESeq2 analyses.
group=c(rep("dWT_PSS", 3), rep("WT_PSS", 3), rep("TV_PSS", 2), rep("dWT_TSS", 3), rep("WT_TSS", 3), rep("TV_TSS", 2))
y <- DGEList(counts=merged_raw, group=group)
nrow(y)
## [1] 7894038
keep <- filterByExpr(y)
y <- y[keep, ,keep.lib.size=FALSE] #reduces from 7,894,038 positions to 15,721
nrow(y)
## [1] 15721
merged_filtered <- merged_raw[row.names(y$counts),]
nrow(merged_filtered)
## [1] 15721
rm(merged_raw)
Since TSS libraries are generally larger than PSS libraries, size factors are determined separately for the different data set types. Size factors for TSS and PSS correlate quite well for different samples.
coldata_PSS <- read.csv("input/colData_PSS.csv", row.names=1)
coldata_TSS <- read.csv("input/colData_TSS.csv", row.names=1)
# create DESeq2 data object
ddsMat_PSS <- DESeqDataSetFromMatrix(countData = merged_filtered[,1:8],
colData = coldata_PSS,
design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_TSS <- DESeqDataSetFromMatrix(countData = merged_filtered[,9:16],
colData = coldata_TSS,
design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# run DESeq
ddsMat_PSS <- estimateSizeFactors(ddsMat_PSS)
ddsMat_TSS <- estimateSizeFactors(ddsMat_TSS)
# factors needed for creation of .grp file for multireads data
write.csv(data.frame(factor=ddsMat_PSS$sizeFactor), file="output/PSS_sizeFactors.csv")
write.csv(data.frame(factor=ddsMat_TSS$sizeFactor), file="output/TSS_sizeFactors.csv")
sizeFact <- c(ddsMat_PSS$sizeFactor, ddsMat_TSS$sizeFactor)
plot(sizeFact[1:8], sizeFact[9:16], xlab="Size Factors PSS", ylab="Size Factors TSS")
If .grp were not yet created, create .grp files by setting if() to TRUE (! takes some time !):
if(FALSE){
PSS_raw <- read.delim("input/PSS_5ends_combined_5sensing.txt", header=TRUE, row.names=1)
names(PSS_raw) <- paste(names(PSS_raw), "_PSS", sep="")
ddsMat_PSS_2 <- DESeqDataSetFromMatrix(countData = PSS_raw,
colData = coldata_PSS,
design = ~ strain)
ddsMat_PSS_2$sizeFactor <- ddsMat_PSS$sizeFactor
PSS_norm_counts <- counts(ddsMat_PSS_2, normalized=TRUE)
rm(ddsMat_PSS_2)
grp_File <- data.frame("WT"=apply(PSS_norm_counts[,c("WT1_PSS", "WT2_PSS", "WT3_PSS")],1,mean), "dWT"=apply(PSS_norm_counts[,c("dWT1_PSS", "dWT2_PSS", "dWT3_PSS")],1,mean), "TV"=apply(PSS_norm_counts[,c("TV1_PSS", "TV2_PSS")],1,mean))
grp_File$seqname <- rep(NA, length(grp_File$WT))
seqnames <- c("BA000022.2", "AP004310.1", "AP004311.1", "AP004312.1", "AP006585.1")
for(i in seqnames){
grp_File[which(grepl(i, row.names(grp_File))),]$seqname <- i
}
grp_File$strand <- rep(NA, length(grp_File$WT))
for(i in c("plus", "minus")){
grp_File[which(grepl(i, row.names(grp_File))),]$strand <- i
}
for(i in seqnames){
for(j in c("plus", "minus")){
tmp <- subset(grp_File, grp_File$seqname==i & grp_File$strand==j)[,c("WT", "dWT", "TV")]
s=""
if(j=="plus"){
s="fw"
}else{
s="rev"
}
write.table(tmp, file=paste("output/grp/PSS/", i, "_PSS_WT-dWT-TV_", s, ".grp", sep=""), sep="\t", row.names=FALSE, col.names = FALSE)
}
}
}
if(FALSE){
TSS_raw <- read.delim("input/TSS_5ends_combined_5sensing.txt", header=TRUE, row.names=1)
names(TSS_raw) <- paste(names(TSS_raw), "_TSS", sep="")
ddsMat_TSS_2 <- DESeqDataSetFromMatrix(countData = TSS_raw,
colData = coldata_TSS,
design = ~ strain)
ddsMat_TSS_2$sizeFactor <- ddsMat_TSS$sizeFactor
TSS_norm_counts <- counts(ddsMat_TSS_2, normalized=TRUE)
rm(ddsMat_TSS_2)
grp_File <- data.frame("WT"=apply(TSS_norm_counts[,c("WT1_TSS", "WT2_TSS", "WT3_TSS")],1,mean), "dWT"=apply(TSS_norm_counts[,c("dWT1_TSS", "dWT2_TSS", "dWT3_TSS")],1,mean), "TV"=apply(TSS_norm_counts[,c("TV1_TSS", "TV2_TSS")],1,mean))
grp_File$seqname <- rep(NA, length(grp_File$WT))
seqnames <- c("BA000022.2", "AP004310.1", "AP004311.1", "AP004312.1", "AP006585.1")
for(i in seqnames){
grp_File[which(grepl(i, row.names(grp_File))),]$seqname <- i
}
grp_File$strand <- rep(NA, length(grp_File$WT))
for(i in c("plus", "minus")){
grp_File[which(grepl(i, row.names(grp_File))),]$strand <- i
}
for(i in seqnames){
for(j in c("plus", "minus")){
tmp <- subset(grp_File, grp_File$seqname==i & grp_File$strand==j)[,c("WT", "dWT", "TV")]
s=""
if(j=="plus"){
s="fw"
}else{
s="rev"
}
write.table(tmp, file=paste("output/grp/TSS/", i, "_TSS_WT-dWT-TV_", s, ".grp", sep=""), sep="\t", row.names=FALSE, col.names = FALSE)
}
}
}
DESeq2 is used to determine which positions are enriched in the TSS and PSS libraries, respectively. Size factors determined above are used.
coldata <- read.csv("input/colData_PSS-TSS.csv", row.names=1)
# create DESeq2 data object
ddsMat_PSSTSS <- DESeqDataSetFromMatrix(countData = merged_filtered,
colData = coldata,
design = ~ strain + type)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_PSSTSS$sizeFactor <- sizeFact
ddsMat_PSSTSS <- DESeq(ddsMat_PSSTSS)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_PSSTSS <- results(ddsMat_PSSTSS, contrast=c("type", "PSS", "TSS"))
write.csv(res_PSSTSS[order(res_PSSTSS$padj),], file="output/DESeq2_resultsTables/results_PSS-TSS-copmarisons.csv")
As a next stop, a first unfiltered set of potential PSS and TSS positions is created:
PSS_positions_unfiltered <- row.names(subset(res_PSSTSS, res_PSSTSS$log2FoldChange>0.8 & res_PSSTSS$padj<0.05))
TSS_positions_unfiltered <- row.names(subset(res_PSSTSS, res_PSSTSS$log2FoldChange<(-0.8) & res_PSSTSS$padj<0.05))
Create GRanges object for PSS data:
PSS_nonReduced_Ranges <- create_GRanges_object(base::strsplit(PSS_positions_unfiltered, "-"), res_PSSTSS[PSS_positions_unfiltered,]$baseMean)
PSS_nonReduced_Ranges$name <- PSS_positions_unfiltered
PSS_nonReduced_Ranges$type <- "PSS"
pdf(file="output/DESeq2_Plots/ddsMat_PSSTSS_DispEsts.pdf", width=4.5, height=4.5)
plotDispEsts(ddsMat_PSSTSS, xlab="Mean of Normalized Counts", ylab="Dispersion")
dev.off()
## png
## 2
plotDispEsts(ddsMat_PSSTSS)
PSS and TSS data sets are separated on principal component 1.
p <- PCA_plot_PSSTSS(ddsMat_PSSTSS, "PSS-TSS")
p
ggsave("output/DESeq2_Plots/ddsMat_PSSTSS_PCA.pdf", plot=p, width=15, height=9, units="cm")
p <- heatmap_plot_PSSTSS(ddsMat_PSSTSS, "PSS-TSS")
p
ggsave("output/DESeq2_Plots/ddsMat_PSSTSS_heatMap.pdf", plot=p, width=15, height=12, units="cm")
Comparing PSS and TSS using DESeq2 yields a bimodal log2foldchange distribution. This underlines that PSS and TSS are quite well discernible using the chosen approach. This log2FC distribution is also observable in the non-normalized count data. This is further corroborated by the MAplot and also regarding p-values etc. (see below: Volcano plot and p-value-plot).
pdf(file="output/DESeq2_Plots/ddsMat_PSSTSS_hist_log2FC.pdf", width=4.5, height=4.5)
hist(res_PSSTSS$log2FoldChange, breaks=20, main="", xlab="Log2FC(PSS/TSS)", ylab="Frequency")
dev.off()
## png
## 2
hist(res_PSSTSS$log2FoldChange, breaks=20, main="DESeq2 Normalization", xlab="Log2FC PSS/TSS", ylab="Frequency")
# non-normalized
log2_all <- log2(apply(merged_filtered[,1:8],1, mean)/apply(merged_filtered[,9:16],1,mean))
hist(log2_all, breaks=20, main="No normalization", xlab="Log2(PSS/TSS)")
p <- MAplot_ggplot(res_PSSTSS, foldchange=0.8, y_axis_label = "Log2 fold-change(PSS/TSS)")
p <- p + scale_colour_manual(values=c("UP"="#4e9879ff", "DOWN"="#9b511fff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
ggsave("output/DESeq2_Plots/ddsMat_PSSTSS_MAplot.pdf",plot=p, width=15, height=12, units="cm")
res_PSSTSS <- subset(res_PSSTSS, !is.na(res_PSSTSS$padj))
# down: TSS, up: PSS
count_up_down(res_PSSTSS, foldchange=0.8, padjusted=0.05)
## [1] "number of features down: 8841"
## [1] "number of features up: 4632"
p <- volcanoPlot_ggplot(res_PSSTSS, foldchange=0.8, padjusted=0.05) + scale_colour_manual(values=c("UP"="#4e9879ff", "DOWN"="#9b511fff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
ggsave("output/DESeq2_Plots/ddsMat_PSSTSS_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")
pdf(file="output/DESeq2_Plots/ddsMat_PSSTSS_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(res_PSSTSS, "Comparison PSS to TSS")
dev.off()
## png
## 2
pvaluePlot(res_PSSTSS, "Comparison PSS to TSS")
By inspecting the identified TSS, we observed that a high number of false-positive TSS positions are present in highly expressed genes. To control for this, we control for TSS with a ratio (number TSS-reads)/(number transcript reads) above a certain threshold. To determine this cut-off, we manually inspected highly expressed genomic positions such as the 5’ UTR of psbA2, ncr0700, ssrA and CRISPR3. For instance for psbA2, the actual TSS are well established by independent experiments (Sakurai et al. (2012)). 2% seemed to be a good cut-off for this control.
Further, we introduced a cut-off to filter against TSS without associated transcription. The used library preparation protocol does not capture sRNAs with a length below 200nt and TSS without accompanying transcription might be TSS of sRNAs. However, we assume a high percentage of TSS which were identified without associated transcription are false positives, e.g. created by reverse transcription artefacts. An example is the second TSS upstream of PsbA2R. The actual TSS of the transcript was determined in Sakurai et al. (2012). We decided to filter out TSS for which the ratio (number TSS-reads)/(number transcript coverage) > 200%.
First, transcript coverage data has to be read in and normalized:
transcript_coverage <- read.delim("input/transcript_coverage_combined_5sensing.txt", header=TRUE, row.names=1)
group=c(rep("dWT", 3), rep("WT", 3), rep("TV", 2))
y <- DGEList(counts=transcript_coverage, group=group)
nrow(y)
## [1] 7894038
keep <- filterByExpr(y)
y <- y[keep, ,keep.lib.size=FALSE] #reduces from 7,894,038 positions to 4,017,775
nrow(y)
## [1] 4017775
transcript_coverage_filt <- transcript_coverage[row.names(y$counts),]
rm(transcript_coverage) # remove to make space
coldata <- read.csv("input/colData_transcript.csv", row.names=1)
# create DESeq2 data object
ddsMat_transc <- DESeqDataSetFromMatrix(countData = transcript_coverage_filt,
colData = coldata,
design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_transc <- estimateSizeFactors(ddsMat_transc)
trans_norm <- as.data.frame(counts(ddsMat_transc, normalized=TRUE))
TSS_norm <- as.data.frame(counts(ddsMat_TSS, normalized=TRUE))
Do actual filtering and select TSS which are located within a distance of 20nt of annotated TSS by Kopf et al. (2014). Save those TSS combined with the PSS determined as part of this analysis as .gff
TSS_trans_ratio_cutoff <- 0.02
upper_TSS_trans_ratio_cutoff <- 2.00
TSS_trans_ratio <- apply(TSS_norm[TSS_positions_unfiltered,],1,mean)/apply(trans_norm[TSS_positions_unfiltered,],1,mean)
TSS_above_cutoff <- subset(TSS_positions_unfiltered, TSS_trans_ratio[TSS_positions_unfiltered]>TSS_trans_ratio_cutoff & TSS_trans_ratio[TSS_positions_unfiltered]<upper_TSS_trans_ratio_cutoff)
TSS_nonReduced_Ranges <- create_GRanges_object(base::strsplit(TSS_above_cutoff, "-"), res_PSSTSS[TSS_above_cutoff,]$baseMean)
TSS_nonReduced_Ranges$name <- TSS_above_cutoff
highly_prob_TSS <- TSS_nonReduced_Ranges[which(countOverlaps(TSS_nonReduced_Ranges, anno_TSS, maxgap=20)>0)]
names_prob_TSS <- highly_prob_TSS$name
length(reduce(highly_prob_TSS))
## [1] 2863
highly_prob_TSS$type <- "TSS"
save_gff(c(highly_prob_TSS,PSS_nonReduced_Ranges), "output/annoTSS_comparison/gffs/only5sensing/", "TSS_PSS_only5sensing")
TSS_TIERseq <- subset(TIERseq_PSS_TSS, TIERseq_PSS_TSS$type=="TSS")
PSS_TIERseq <- subset(TIERseq_PSS_TSS, TIERseq_PSS_TSS$type=="PSS")
PSS_combined <- unique(c(PSS_nonReduced_Ranges, PSS_TIERseq))
length(PSS_combined)
## [1] 6490
length(PSS_combined)/length(c(PSS_TIERseq,PSS_nonReduced_Ranges))
## [1] 0.8030191
PSS_combined$strand_ident <- NA
PSS_combined[strand(PSS_combined)=="+",]$strand_ident <- "plus"
PSS_combined[strand(PSS_combined)=="-",]$strand_ident <- "minus"
PSS_positions_combined <- paste(seqnames(PSS_combined), start(PSS_combined), PSS_combined$strand_ident, sep="-")
TSS_combined <- unique(c(highly_prob_TSS, TSS_TIERseq))
length(TSS_combined)
## [1] 4440
length(TSS_combined)/length(c(highly_prob_TSS, TSS_TIERseq))
## [1] 0.6357388
TSS_combined$strand_ident <- NA
TSS_combined[strand(TSS_combined)=="+",]$strand_ident <- "plus"
TSS_combined[strand(TSS_combined)=="-",]$strand_ident <- "minus"
TSS_positions_combined <- paste(seqnames(TSS_combined), start(TSS_combined), TSS_combined$strand_ident, sep="-")
TSS_combined$type <- "TSS"
PSS_combined$type <- "PSS"
save_gff(c(TSS_combined, PSS_combined), "output/annoTSS_comparison/gffs/combined/", "TSS_PSS_combined")
PSS_positions_Ranges <- GenomicRanges::reduce(PSS_combined)
length(PSS_positions_Ranges)
## [1] 5429
TSS_positions_Ranges <- GenomicRanges::reduce(TSS_combined)
length(TSS_positions_Ranges)
## [1] 3511
p <- ggplot(data=as.data.frame(PSS_positions_Ranges), aes(x=width)) + geom_histogram(fill="darkgray", binwidth=1, color="darkgray") + theme_light() + xlab("Width of reported PSS") + ylab("Number of cases") + scale_x_continuous(breaks=c(1,2,3,4,5,6,7,8,9,10))
p
ggsave("output/annoTSS_comparison/hist_width_reported-PSS.pdf", plot=p, width=9, height=9, units="cm")
p <- ggplot(data=as.data.frame(TSS_positions_Ranges), aes(x=width)) + geom_histogram(fill="darkgray", binwidth=1, color="darkgray") + theme_light() + xlab("Width of reported TSS") + ylab("Number of cases") + scale_x_continuous(breaks=c(1,2,3,4,5,6,7,8,9,10), limits=c(0,10))
p
## Warning: Removed 2 rows containing missing values (geom_bar).
ggsave("output/annoTSS_comparison/hist_width_reported-TSS.pdf", plot=p, width=9, height=9, units="cm")
## Warning: Removed 2 rows containing missing values (geom_bar).
# prepare data.frame to count numbers of TSS, PSS overlapping with anno-TSS
df_overlap_TSSPSS <- as.data.frame(cbind(type=c(rep("PSS", 4), rep("TSS",4)), TSS=c(rep(c("start", "alt", "int", "i-start"), 2))))
df_overlap_TSSPSS$number <- NA
## count TSS, PSS overlapping with certain anno-TSS
# overlap of start anno_TSS _without_ TU_type: i with PSS, TSS
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS" & df_overlap_TSSPSS$TSS=="start",3] <- sum(countOverlaps(PSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="start" & !grepl("i", anno_TSS$TU_type, fixed = TRUE))))
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="TSS" & df_overlap_TSSPSS$TSS=="start",3] <- sum(countOverlaps(TSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="start" & !grepl("i", anno_TSS$TU_type, fixed = TRUE))))
# overlap of start anno_TSS _with only_ TU_type: i with PSS, TSS
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS" & df_overlap_TSSPSS$TSS=="i-start",3] <- sum(countOverlaps(PSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="start" & grepl("i", anno_TSS$TU_type, fixed = TRUE))))
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="TSS" & df_overlap_TSSPSS$TSS=="i-start",3] <- sum(countOverlaps(TSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="start" & grepl("i", anno_TSS$TU_type, fixed = TRUE))))
# overlap of alt anno_TSS with PSS, TSS
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS" & df_overlap_TSSPSS$TSS=="alt",3] <- sum(countOverlaps(PSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="alt")))
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="TSS" & df_overlap_TSSPSS$TSS=="alt",3] <- sum(countOverlaps(TSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="alt")))
# overlap of int anno_TSS with PSS, TSS
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS" & df_overlap_TSSPSS$TSS=="int",3] <- sum(countOverlaps(PSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="int")))
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="TSS" & df_overlap_TSSPSS$TSS=="int",3] <- sum(countOverlaps(TSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="int")))
#control
sum((countOverlaps(PSS_positions_Ranges, anno_TSS)>0))
## [1] 55
sum(df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS",]$number)
## [1] 55
sum((countOverlaps(TSS_positions_Ranges,anno_TSS)>0))
## [1] 2485
sum((countOverlaps(anno_TSS,TSS_positions_Ranges)>0)) # problem: anno_TSS are not unique - some alt and int TSS overlap.
## [1] 2518
sum(df_overlap_TSSPSS[df_overlap_TSSPSS$type=="TSS",]$number)
## [1] 2518
The majority of anno-TSS which overlap with PSS are not canonical start TSS, but actual int or alt TSS.
p <- ggplot(data=df_overlap_TSSPSS, aes(x=type, y=number, fill=TSS)) + geom_bar(position="fill", stat="identity") + theme_light()+ xlab("") + ylab("Percentage") + scale_fill_grey(start = .2, end = .9)
p
ggsave(filename = "output/annoTSS_comparison/barplot_TSS-PSS-KopfTSS.pdf", plot = p, width = 7, height = 7, units = "cm")
write.csv(as.data.frame(PSS_positions_Ranges[which(countOverlaps(PSS_positions_Ranges, anno_TSS)>0)]), file="output/annoTSS_comparison/PSS_overlapping_annoTSS.csv")
Calculate how many PSS overlap with annotated regions:
sum(countOverlaps(PSS_positions_Ranges,TUs)>0|countOverlaps(PSS_positions_Ranges,features)>0)/length(PSS_positions_Ranges)
## [1] 0.9845275
Calculate number and percentage of TUs and CDS which are overlapped by captured PSS:
sum(countOverlaps(TUs,PSS_positions_Ranges)>0)
## [1] 1061
sum(countOverlaps(TUs,PSS_positions_Ranges)>0)/length(TUs)
## [1] 0.2593498
sum(countOverlaps(features,PSS_positions_Ranges)>0)
## [1] 1321
sum(countOverlaps(features,PSS_positions_Ranges)>0)/length(features)
## [1] 0.215357
syne_fasta <- readDNAStringSet("input/Synechocystis.fasta", format="fasta")
The following chunk of code filters TSS and PSS positions, so that the resulting GRanges objects only encompass TSS/PSS positions which are the most populated in a TSS/PSS region (a stretch of directly adjacent TSS/PSS).
TSS_advReduce_Ranges <- advanced_reduce(TSS_nonReduced_Ranges)
PSS_advReduce_Ranges <- advanced_reduce(PSS_nonReduced_Ranges)
Actually interesting stuff is only happening in region -20 nt upstream: Hence, only extract this region.
TSS_DNA_sequences_40nt <- extract_DNA_sequences(TSS_advReduce_Ranges, syne_fasta, len=20, only_upstream=TRUE)
PSS_DNA_sequences <- extract_DNA_sequences(PSS_advReduce_Ranges, syne_fasta, len=10, only_upstream=FALSE)
writeXStringSet(TSS_DNA_sequences_40nt, "output/annoTSS_comparison/weblogos/allTSS_20nt_forWeblogo.fasta", append=FALSE,
compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(PSS_DNA_sequences, "output/annoTSS_comparison/weblogos/PSS_10nt_forWeblogo_upAndDownstream.fasta", append=FALSE,
compress=FALSE, compression_level=NA, format="fasta")
run createWeblogos_10-20nt_DNA.txt for creation of sequence logos
PSS_raw <- read.delim("input/PSS_5ends_combined_5sensing.txt", header=TRUE, row.names=1)
PSS_use <- PSS_raw[PSS_positions_combined,]
TSS_raw <- read.delim("input/TSS_5ends_combined_5sensing.txt", header=TRUE, row.names=1)
TSS_use <- TSS_raw[TSS_positions_combined,]
rm(PSS_raw)
rm(TSS_raw)
coldata <- read.csv("input/colData.csv", row.names=1)
ddsMat_PSS <- DESeqDataSetFromMatrix(countData = PSS_use,
colData = coldata,
design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_TSS <- DESeqDataSetFromMatrix(countData = TSS_use,
colData = coldata,
design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_PSS <- DESeq(ddsMat_PSS)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsMat_TSS <- DESeq(ddsMat_TSS)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
write.csv(data.frame(counts(ddsMat_PSS, normalized=TRUE)), file="output/PSS_normalizedCounts.csv")
write.csv(data.frame(counts(ddsMat_TSS, normalized=TRUE)), file="output/TSS_normalizedCounts.csv")
pdf(file="output/DESeq2_Plots/ddsMat_PSS_DispEsts.pdf", width=4.5, height=4.5)
plotDispEsts(ddsMat_PSS, main="PSS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
dev.off()
## png
## 2
pdf(file="output/DESeq2_Plots/ddsMat_TSS_DispEsts.pdf", width=4.5, height=4.5)
plotDispEsts(ddsMat_TSS, main="TSS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
dev.off()
## png
## 2
plotDispEsts(ddsMat_PSS, main="PSS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
plotDispEsts(ddsMat_TSS, main="TSS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
p <- PCA_plot(ddsMat_PSS, "PSS")
p
ggsave("output/DESeq2_Plots/ddsMat_PSS_PCA.pdf", plot=p, width=9, height=9, units="cm")
p <- PCA_plot(ddsMat_TSS, "TSS")
p
ggsave("output/DESeq2_Plots/ddsMat_TSS_PCA.pdf", plot=p, width=9, height=9, units="cm")
p <- heatmap_plot(ddsMat_PSS, "PSS")
p
ggsave("output/DESeq2_Plots/ddsMat_PSS_heatMap.pdf", plot=p, width=15, height=12, units="cm")
p <- heatmap_plot(ddsMat_TSS, "TSS")
p
ggsave("output/DESeq2_Plots/ddsMat_TSS_heatMap.pdf", plot=p, width=15, height=12, units="cm")
# extract results
PSS_result_dWT_WT <- results(ddsMat_PSS, contrast=c("strain", "dWT", "WT")) # dWT/WT -> higher in dWT: higher log2FC
write.csv(PSS_result_dWT_WT[order(PSS_result_dWT_WT$padj),], file="output/DESeq2_resultsTables/results_PSS-dWT-WT.csv")
PSS_result_dWT_TV <- results(ddsMat_PSS, contrast=c("strain", "dWT", "TV")) # dWT/TV -> higher in dWT: higher log2FC
write.csv(PSS_result_dWT_TV[order(PSS_result_dWT_TV$padj),], file="output/DESeq2_resultsTables/results_PSS-dWT-TV.csv")
PSS_result_WT_TV <- results(ddsMat_PSS, contrast=c("strain", "WT", "TV")) # WT/TV -> higher in WT: higher log2FC
write.csv(PSS_result_WT_TV[order(PSS_result_WT_TV$padj),], file="output/DESeq2_resultsTables/results_PSS-WT-TV.csv")
TSS_result_dWT_WT <- results(ddsMat_TSS, contrast=c("strain", "dWT", "WT")) # dWT/WT -> higher in dWT: higher log2FC
write.csv(TSS_result_dWT_WT[order(TSS_result_dWT_WT$padj),], file="output/DESeq2_resultsTables/results_TSS-dWT-WT.csv")
TSS_result_dWT_TV <- results(ddsMat_TSS, contrast=c("strain", "dWT", "TV")) # dWT/TV -> higher in dWT: higher log2FC
write.csv(TSS_result_dWT_TV[order(TSS_result_dWT_TV$padj),], file="output/DESeq2_resultsTables/results_TSS-dWT-TV.csv")
TSS_result_WT_TV <- results(ddsMat_TSS, contrast=c("strain", "WT", "TV")) # WT/TV -> higher in WT: higher log2FC
write.csv(TSS_result_WT_TV[order(TSS_result_WT_TV$padj),], file="output/DESeq2_resultsTables/results_TSS-WT-TV.csv")
Annotate peak regions
indices_plus <- which(strand(PSS_nonReduced_Ranges)=="+")
indices_minus <- which(strand(PSS_nonReduced_Ranges)=="-")
PSS_names <- c()
PSS_names[indices_plus] <- paste(seqnames(PSS_nonReduced_Ranges[indices_plus]),start(PSS_nonReduced_Ranges[indices_plus]),"plus",sep="-")
PSS_names[indices_minus] <- paste(seqnames(PSS_nonReduced_Ranges[indices_minus]),start(PSS_nonReduced_Ranges[indices_minus]),"minus",sep="-")
rm(indices_plus)
rm(indices_minus)
PSS_nonReduced_Ranges$names <- PSS_names
PSS_nonReduced_Ranges$featuresOverlap <- rep("", length(PSS_nonReduced_Ranges))
PSS_nonReduced_Ranges$TU_overlap <- rep("", length(PSS_nonReduced_Ranges))
for(i in 1:length(PSS_nonReduced_Ranges)){
PSS_nonReduced_Ranges$featuresOverlap[i] <- paste(features[which(countOverlaps(features,PSS_nonReduced_Ranges[i])>0)]$locus_tag,collapse=",")
PSS_nonReduced_Ranges$TU_overlap[i] <- paste(TUs[which(countOverlaps(TUs,PSS_nonReduced_Ranges[i])>0)]$index,collapse=",")
}
# create tables with annotation: which features and TUs are overlapping with PSS?
df_PSS_nonRed_Ranges <- as.data.frame(PSS_nonReduced_Ranges)
row.names(df_PSS_nonRed_Ranges) <- df_PSS_nonRed_Ranges$names
# PSS result dWT WT
PSS_result_dWT_WT_annot <- rownames_to_column(as.data.frame(PSS_result_dWT_WT[order(PSS_result_dWT_WT$padj),]))
PSS_result_dWT_WT_annot$CDS <- df_PSS_nonRed_Ranges[PSS_result_dWT_WT_annot$rowname,]$featuresOverlap
PSS_result_dWT_WT_annot$TUs <- df_PSS_nonRed_Ranges[PSS_result_dWT_WT_annot$rowname,]$TU_overlap
PSS_result_dWT_WT_annot <- PSS_result_dWT_WT_annot[,c(8,9,1,2:7)]
write_tsv(PSS_result_dWT_WT_annot, "output/DESeq2_resultsTables/results_dWT_WT_annotated.tsv")
# PSS result dWT TV
PSS_result_dWT_TV_annot <- rownames_to_column(as.data.frame(PSS_result_dWT_TV[order(PSS_result_dWT_TV$padj),]))
PSS_result_dWT_TV_annot$CDS <- df_PSS_nonRed_Ranges[PSS_result_dWT_TV_annot$rowname,]$featuresOverlap
PSS_result_dWT_TV_annot$TUs <- df_PSS_nonRed_Ranges[PSS_result_dWT_TV_annot$rowname,]$TU_overlap
PSS_result_dWT_TV_annot <- PSS_result_dWT_TV_annot[,c(8,9,1,2:7)]
write_tsv(PSS_result_dWT_TV_annot, "output/DESeq2_resultsTables/results_dWT-TV_annotated.tsv")
# PSS result WT, TV
PSS_result_WT_TV_annot <- rownames_to_column(as.data.frame(PSS_result_WT_TV[order(PSS_result_WT_TV$padj),]))
PSS_result_WT_TV_annot$CDS <- df_PSS_nonRed_Ranges[PSS_result_WT_TV_annot$rowname,]$featuresOverlap
PSS_result_WT_TV_annot$TUs <- df_PSS_nonRed_Ranges[PSS_result_WT_TV_annot$rowname,]$TU_overlap
PSS_result_WT_TV_annot <- PSS_result_WT_TV_annot[,c(8,9,1,2:7)]
write_tsv(PSS_result_WT_TV_annot, "output/DESeq2_resultsTables/results_PSS-WT-TV_annotated.tsv")
Run in respective Directory (output/DESeq2_resultsTables/) python changeAnnotation_DESeq2.py results_dWT_WT_annotated.tsv results_dWT_WT_annotated-2.tsv python changeAnnotation_DESeq2.py results_dWT-TV_annotated.tsv results_dWT-TV_annotated-2.tsv python changeAnnotation_DESeq2.py results_PSS-WT-TV_annotated.tsv results_PSS-WT-TV_annotated-2.tsv
pdf(file="output/DESeq2_Plots/ddsMat_PSS-dWT-WT_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(PSS_result_dWT_WT, "PSS dWT WT")
dev.off()
## png
## 2
pdf(file="output/DESeq2_Plots/ddsMat_PSS-dWT-TV_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(PSS_result_dWT_TV, "PSS dWT dTV")
dev.off()
## png
## 2
pdf(file="output/DESeq2_Plots/ddsMat_dWT-WT_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(TSS_result_dWT_WT, "TSS dWT WT")
dev.off()
## png
## 2
pdf(file="output/DESeq2_Plots/ddsMat_TSS-dWT-TV_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(TSS_result_dWT_TV, "TSS dWT TV")
dev.off()
## png
## 2
pvaluePlot(PSS_result_dWT_WT, "PSS dWT WT")
pvaluePlot(PSS_result_dWT_TV, "PSS dWT dTV")
pvaluePlot(TSS_result_dWT_WT, "TSS dWT WT")
pvaluePlot(TSS_result_dWT_TV, "TSS dWT TV")
Filter out PSS below filter level of results(). For these, p.adj is set to NA - hence: remove positions at which p.adj==NA
PSS_result_dWT_WT <- subset(PSS_result_dWT_WT, !is.na(PSS_result_dWT_WT$padj))
PSS_result_dWT_TV <- subset(PSS_result_dWT_TV, !is.na(PSS_result_dWT_TV$padj))
PSS_result_WT_TV <- subset(PSS_result_WT_TV, !is.na(PSS_result_WT_TV$padj))
TSS_result_dWT_WT <- subset(TSS_result_dWT_WT, !is.na(TSS_result_dWT_WT$padj))
TSS_result_dWT_TV <- subset(TSS_result_dWT_TV, !is.na(TSS_result_dWT_TV$padj))
TSS_result_WT_TV <- subset(TSS_result_WT_TV, !is.na(TSS_result_WT_TV$padj))
count_up_down(PSS_result_dWT_WT, foldchange=1, padjusted=0.05)
## [1] "number of features down: 7"
## [1] "number of features up: 73"
p <- volcanoPlot_ggplot(as.data.frame(PSS_result_dWT_WT), foldchange=1, padjusted=0.05) + scale_colour_manual(values=c("DOWN"="#000000ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
ggsave("output/DESeq2_Plots/ddsMat_PSS-0h_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")
p <- MAplot_ggplot(PSS_result_dWT_WT, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT)/WT)") + scale_colour_manual(values=c("DOWN"="#000000ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
ggsave("output/DESeq2_Plots/ddsMat_PSS-dWT-WT_MAplot.pdf",plot=p, width=15, height=12, units="cm")
count_up_down(PSS_result_dWT_TV, foldchange=1, padjusted=0.05)
## [1] "number of features down: 178"
## [1] "number of features up: 105"
p <- volcanoPlot_ggplot(as.data.frame(PSS_result_dWT_TV), foldchange=1, padjusted=0.05) + scale_colour_manual(values=c("DOWN"="#e69f00ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
ggsave("output/DESeq2_Plots/ddsMat_PSS-dWT-TV_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")
p <- MAplot_ggplot(PSS_result_dWT_TV, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT)/rne(5p))") + scale_colour_manual(values=c("DOWN"="#e69f00ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))# + ylim(-5,+5)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
ggsave("output/DESeq2_Plots/ddsMat_PSS-1h_MAplot.pdf",plot=p, width=15, height=12, units="cm")
count_up_down(PSS_result_WT_TV, foldchange = 1, padjusted = 0.05)
## [1] "number of features down: 239"
## [1] "number of features up: 147"
p <- MAplot_ggplot(PSS_result_WT_TV, foldchange=1.0, y_axis_label = "Log2 fold-change(WT/rne(5p))") + scale_colour_manual(values=c("DOWN"="#e69f00ff", "UP"="#000000ff", "NO"="#d3d3d3ff"))# + ylim(-5,+5)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
ggsave("output/DESeq2_Plots/ddsMat_PSS-WT-TV_MAplot.pdf",plot=p, width=15, height=12, units="cm")
count_up_down(TSS_result_dWT_WT, foldchange=1, padjusted=0.05)
## [1] "number of features down: 77"
## [1] "number of features up: 83"
p <- volcanoPlot_ggplot(as.data.frame(TSS_result_dWT_WT), foldchange=1, padjusted=0.05) + scale_colour_manual(values=c("DOWN"="#000000ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
ggsave("output/DESeq2_Plots/ddsMat_TSS-0h_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")
p <- MAplot_ggplot(TSS_result_dWT_WT, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT)/WT)") + scale_colour_manual(values=c("DOWN"="#000000ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
ggsave("output/DESeq2_Plots/ddsMat_TSS-dWT-WT_MAplot.pdf",plot=p, width=15, height=12, units="cm")
count_up_down(TSS_result_dWT_TV, foldchange=1, padjusted=0.05)
## [1] "number of features down: 18"
## [1] "number of features up: 14"
p <- volcanoPlot_ggplot(as.data.frame(TSS_result_dWT_TV), foldchange=1, padjusted=0.05) + scale_colour_manual(values=c("DOWN"="#e69f00ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
ggsave("output/DESeq2_Plots/ddsMat_TSS-dWT-TV_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")
p <- MAplot_ggplot(TSS_result_dWT_TV, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT)/rne(5p))") + scale_colour_manual(values=c("DOWN"="#e69f00ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))# + ylim(-5,+5)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
ggsave("output/DESeq2_Plots/ddsMat_TSS-1h_MAplot.pdf",plot=p, width=15, height=12, units="cm")
count_up_down(TSS_result_WT_TV, foldchange = 1, padjusted = 0.05)
## [1] "number of features down: 140"
## [1] "number of features up: 212"
p <- MAplot_ggplot(TSS_result_WT_TV, foldchange=1.0, y_axis_label = "Log2 fold-change(WT/rne(5p))") + scale_colour_manual(values=c("DOWN"="#e69f00ff", "UP"="#000000ff", "NO"="#d3d3d3ff"))# + ylim(-5,+5)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
ggsave("output/DESeq2_Plots/ddsMat_TSS-WT-TV_MAplot.pdf",plot=p, width=15, height=12, units="cm")
TSSPSS_at_TSSpositions <- counts(ddsMat_PSSTSS, normalize=FALSE)[names_prob_TSS,] # normalize= TRUE/FALSE: should not make difference, but decided to use FALSE since should introduce less bias
# calculate ratios
dWT1_ratio=TSSPSS_at_TSSpositions[,"dWT1_PSS"]/TSSPSS_at_TSSpositions[,"dWT1_TSS"]
dWT2_ratio=TSSPSS_at_TSSpositions[,"dWT2_PSS"]/TSSPSS_at_TSSpositions[,"dWT2_TSS"]
dWT3_ratio=TSSPSS_at_TSSpositions[,"dWT3_PSS"]/TSSPSS_at_TSSpositions[,"dWT3_TSS"]
WT1_ratio=TSSPSS_at_TSSpositions[,"WT1_PSS"]/TSSPSS_at_TSSpositions[,"WT1_TSS"]
WT2_ratio=TSSPSS_at_TSSpositions[,"WT2_PSS"]/TSSPSS_at_TSSpositions[,"WT2_TSS"]
WT3_ratio=TSSPSS_at_TSSpositions[,"WT3_PSS"]/TSSPSS_at_TSSpositions[,"WT3_TSS"]
TV1_ratio=TSSPSS_at_TSSpositions[,"TV1_PSS"]/TSSPSS_at_TSSpositions[,"TV1_TSS"]
TV2_ratio=TSSPSS_at_TSSpositions[,"TV2_PSS"]/TSSPSS_at_TSSpositions[,"TV2_TSS"]
# create df for plotting
df_TSSPSS_ratio <- data.frame(sample=c(rep("dWT1", length(names_prob_TSS)),rep("dWT2", length(names_prob_TSS)),rep("dWT3", length(names_prob_TSS)),rep("WT1", length(names_prob_TSS)),rep("WT2", length(names_prob_TSS)),rep("WT3", length(names_prob_TSS)),rep("TV1", length(names_prob_TSS)),rep("TV2", length(names_prob_TSS))), type=c(rep("rne(WT)", 3*length(names_prob_TSS)), rep("WT", 3*length(names_prob_TSS)), rep("rne(5p)", 2*length(names_prob_TSS))), TSSPSS_ratio <- c(dWT1_ratio, dWT2_ratio, dWT3_ratio, WT1_ratio, WT2_ratio, WT3_ratio, TV1_ratio, TV2_ratio))
p <- ggplot(data=df_TSSPSS_ratio, aes(x=sample, y=TSSPSS_ratio, fill=type)) + geom_violin() +
geom_boxplot(width=0.1, color="grey", alpha=0.2, outlier.shape=NA, show.legend = FALSE) +
scale_fill_viridis(discrete=TRUE, alpha=0.6, option="A") +
theme_light() +
labs(fill="", y="PSS/TSS at TSS positions", x="")
p
## Warning: Removed 8 rows containing non-finite values (stat_ydensity).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
df_TSSPSS_ratio_2 <- subset(df_TSSPSS_ratio, !is.na(df_TSSPSS_ratio$TSSPSS_ratio) & !is.infinite(df_TSSPSS_ratio$TSSPSS_ratio))
df_TSSPSS_ratio_2$TSSPSS_ratio <- df_TSSPSS_ratio_2$TSSPSS_ratio + 0.0001
p <- ggplot(data=df_TSSPSS_ratio_2, aes(x=sample, y=TSSPSS_ratio, fill=type)) + geom_violin() +
geom_boxplot(width=0.1, color="grey", alpha=0.2, outlier.shape=NA, show.legend = FALSE) +
scale_fill_viridis(discrete=TRUE, alpha=0.6, option="A") +
theme_light() +
labs(fill="", y="PSS/TSS at TSS positions", x="") + scale_y_continuous(trans="log10", limits=c(NA,10), breaks=c(0,0.01,0.1,1,10))
p
advanced_reduce_withBaseMeans() merges adjacent PSS positions. In a second step, peaks which are wider than 1nt are reduced to a 1nt wide position according to the baseMeans of the peaks - the most highly populated position is given. PSS_Ranges_WT are peaks which are higher in rne(WT), PSS_Ranges_TV peaks which are higher in rne(5p). Since there are so few peaks differentially populated in the comparison between rne(WT) and WT, these two sets of peaks will not be analysed more specifically.
PSS_Ranges_WT <- advanced_reduce_withBaseMeans(create_GRanges_object_from_resObject(PSS_result_dWT_TV, foldchange=1, padjusted=0.05))
PSS_Ranges_TV <- advanced_reduce_withBaseMeans(create_GRanges_object_from_resObject(PSS_result_dWT_TV, foldchange=(-1), padjusted=0.05, up=FALSE))
PSS_Ranges <- c(PSS_Ranges_WT, PSS_Ranges_TV)
peaks_all <- extract_RNA_sequences(PSS_Ranges, syne_fasta, 5)
peaks_WT <- extract_RNA_sequences(PSS_Ranges_WT, syne_fasta, 5)
peaks_TV <- extract_RNA_sequences(PSS_Ranges_TV, syne_fasta, 5)
writeXStringSet(peaks_all, "output/rneWT_rne5p_Weblogo/all_peaks_5upDownstream.fasta", append=FALSE,
compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(peaks_WT, "output/rneWT_rne5p_Weblogo/peaks_WT_5upDownstream.fasta", append=FALSE,
compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(peaks_TV, "output/rneWT_rne5p_Weblogo/peaks_TV_5upDownstream.fasta", append=FALSE,
compress=FALSE, compression_level=NA, format="fasta")
sum((apply(oligonucleotideFrequency(syne_fasta, width=1), 2, sum)/sum(apply(oligonucleotideFrequency(syne_fasta, width=1),2,sum)))[2:3]) # get GC-content for weblogo
## [1] 0.473662
run createWebLogos.txt for creation of sequence logos
Prepare data frames for the plots and count how many PSS are overlapping with which sequence and with how many of the respective features:
# prepare data.frame for barplot
sequences <- names(syne_fasta)
updown <- c(rep("rne(WT)",length(sequences)), rep("rne(5p)",length(sequences)))
df_PSS_sequence_barplot <- data.frame(cbind(sequence=sequences, updown=updown))
df_PSS_sequence_barplot$updown <- factor(df_PSS_sequence_barplot$updown, levels=c("rne(WT)", "rne(5p)"))
df_PSS_sequence_barplot$x_labels <- NA
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="BA000022.2",]$x_labels <- "Chr"
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="AP004311.1",]$x_labels <- "pSYSA"
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="AP004312.1",]$x_labels <- "pSYSG"
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="AP004310.1",]$x_labels <- "pSYSM"
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="AP006585.1",]$x_labels <- "pSYSX"
df_PSS_sequence_barplot$x_labels <- factor(df_PSS_sequence_barplot$x_labels, levels=c("Chr", "pSYSA", "pSYSG", "pSYSM", "pSYSX"))
df_PSS_sequence_barplot$geno_size <- NA
for(i in df_PSS_sequence_barplot$sequence){
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence==i,]$geno_size <- seqlengths(syne_fasta)[i]
}
df_PSS_sequence_barplot$geno_relative_size <- df_PSS_sequence_barplot$geno_size/sum(seqlengths(syne_fasta))*100
df_PSS_sequence_barplot$number_feat_overlap <- NA
df_PSS_sequence_barplot$number_PSS_overlap <- NA
df_PSS_sequence_barplot$number_feat_total <- NA
df_PSS_sequence_barplot$PSS_kb <- NA
# prepare data frame for number of overlaps with certain features
df_PSS_sequence_overlap_perKb <- data.frame()
# extract number of overlaps with different sequences and features, both for up- and downregulated PSS positions
for(s in sequences){
index_up <- which(df_PSS_sequence_barplot$sequence==s & df_PSS_sequence_barplot$updown=="rne(WT)")
index_down <- which(df_PSS_sequence_barplot$sequence==s & df_PSS_sequence_barplot$updown=="rne(5p)")
df_PSS_sequence_barplot[index_up,"number_feat_overlap"] <- sum(countOverlaps(subset(features, seqnames(features)==s), PSS_Ranges_WT)>0) # count number of features affected
df_PSS_sequence_barplot[index_up,"number_PSS_overlap"] <- length(subset(PSS_Ranges_WT, seqnames(PSS_Ranges_WT)==s))
df_PSS_sequence_barplot[index_up,"PSS_kb"] <- length(subset(PSS_Ranges_WT, seqnames(PSS_Ranges_WT)==s))/(seqlengths(syne_fasta[s])/1000)
df_PSS_sequence_barplot[index_down,"number_feat_overlap"] <- sum(countOverlaps(subset(features, seqnames(features)==s), PSS_Ranges_TV)>0) # count number of features affected
df_PSS_sequence_barplot[index_down,"number_PSS_overlap"] <- length(subset(PSS_Ranges_TV, seqnames(PSS_Ranges_TV)==s))
df_PSS_sequence_barplot[index_down,"PSS_kb"] <- length(subset(PSS_Ranges_TV, seqnames(PSS_Ranges_TV)==s))/(seqlengths(syne_fasta[s])/1000)
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence==s,"number_feat_total"] <- length(subset(features, seqnames(features)==s))
}
df_PSS_sequence_barplot$perc_feat <- (df_PSS_sequence_barplot$number_feat_overlap/df_PSS_sequence_barplot$number_feat_total)*100
df_PSS_sequence_barplot$perc_all_feat <- NA
df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(WT)",]$perc_all_feat <- df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(WT)",]$number_PSS_overlap/sum(df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(WT)",]$number_PSS_overlap)*100
df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(5p)",]$perc_all_feat <- df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(5p)",]$number_PSS_overlap/sum(df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(5p)",]$number_PSS_overlap)*100
df_PSS_sequence_barplot
## sequence updown x_labels geno_size geno_relative_size number_feat_overlap
## 1 AP004311.1 rne(WT) pSYSA 103307 2.617342 2
## 2 AP006585.1 rne(WT) pSYSX 106004 2.685672 0
## 3 AP004310.1 rne(WT) pSYSM 119895 3.037609 0
## 4 AP004312.1 rne(WT) pSYSG 44343 1.123455 0
## 5 BA000022.2 rne(WT) Chr 3573470 90.535921 59
## 6 AP004311.1 rne(5p) pSYSA 103307 2.617342 0
## 7 AP006585.1 rne(5p) pSYSX 106004 2.685672 2
## 8 AP004310.1 rne(5p) pSYSM 119895 3.037609 0
## 9 AP004312.1 rne(5p) pSYSG 44343 1.123455 0
## 10 BA000022.2 rne(5p) Chr 3573470 90.535921 69
## number_PSS_overlap number_feat_total PSS_kb perc_feat perc_all_feat
## 1 4 111 0.038719545 1.801802 4.0404040
## 2 0 112 0.000000000 0.000000 0.0000000
## 3 0 134 0.000000000 0.000000 0.0000000
## 4 0 51 0.000000000 0.000000 0.0000000
## 5 95 5726 0.026584804 1.030388 95.9595960
## 6 2 111 0.019359772 0.000000 1.3071895
## 7 1 112 0.009433606 1.785714 0.6535948
## 8 1 134 0.008340631 0.000000 0.6535948
## 9 0 51 0.000000000 0.000000 0.0000000
## 10 149 5726 0.041696166 1.205030 97.3856209
write.csv(df_PSS_sequence_barplot, file="output/overlap_contigs_PSS.csv")
# Plot: PSS per kb
p <- ggplot(data=df_PSS_sequence_barplot, aes(x=x_labels,y=PSS_kb, fill=updown)) + geom_bar(stat="identity", position="dodge") + theme_light()+ xlab("") + ylab("PSS/kb") + scale_fill_manual(name="", values=c("#005a96ff", "#e69f00ff"), breaks=c("rne(WT)", "rne(5p)"), labels=c("rne(WT)", "rne(5p)"))
p
ggsave(filename = "output/barplots_overlapFeatures/barplot_PSSkb_chr_plasmids.pdf", plot = p, width = 14, height = 7, units = "cm")
# Plot: How many features of a certain kind are overlapped by PSS?
p <- ggplot(data=df_PSS_sequence_barplot, aes(x=x_labels,y=perc_feat, fill=updown)) + geom_bar(stat="identity", position="dodge") + theme_light()+ xlab("") + ylab("Features overlapped by PSS (%)") + scale_fill_manual(name="", values=c("#005a96ff", "#e69f00ff"), breaks=c("rne(WT)", "rne(5p)"), labels=c("rne(WT)", "rne(5p)"))
p
ggsave(filename = "output/barplots_overlapFeatures/barplot_percentageFeatures_chr_plasmids.pdf", plot = p, width = 14, height = 7, units = "cm")
# Plots: How many PSS are localized in which kind of feature?
p <- ggplot(data=df_PSS_sequence_barplot, aes(x=updown,y=number_PSS_overlap, fill=x_labels)) + geom_bar(stat="identity", position="stack") + theme_light()+ xlab("") + ylab("Percentage of all PSS (%)") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p
ggsave(filename = "output/barplots_overlapFeatures/barplot_absolute_chromo_plasmids_ofAllPSS.pdf", plot = p, width = 7, height = 7, units = "cm")
p <- ggplot(data=df_PSS_sequence_barplot, aes(x=updown,y=number_PSS_overlap, fill=x_labels)) + geom_bar(stat="identity", position="fill") + theme_light()+ xlab("") + ylab("Number PSS") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p
ggsave(filename = "output/barplots_overlapFeatures/barplot_perc_chromo_plasmids_ofAllPSS.pdf", plot = p, width = 7, height = 7, units = "cm")
For comparison, plot lengths of different parts of genome:
seqlengths_syne <- as.data.frame(seqlengths(syne_fasta))
seqlengths_syne$name <- NA
seqlengths_syne["BA000022.2",]$name <- "Chr"
seqlengths_syne["AP004311.1",]$name <- "pSYSA"
seqlengths_syne["AP004312.1",]$name <- "pSYSG"
seqlengths_syne["AP004310.1",]$name <- "pSYSM"
seqlengths_syne["AP006585.1",]$name <- "pSYSX"
seqlengths_syne$x <- "a"
p <- ggplot(data=seqlengths_syne, aes(x=x, y=seqlengths(syne_fasta), fill=name)) + geom_bar(stat="identity", position="fill") + theme_light()+ xlab("") + ylab("Percentage of total length") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p
ggsave(filename = "output/barplots_overlapFeatures/barplot_perc_chromo_plasmids_lengths.pdf", plot = p, width = 6, height = 7, units = "cm")
Prepare data frames for the plots and count how many PSS are overlapping with which features, how many features of a certain type are overlapped by PSS, how many PSS are located in each feature per kb.
# prepare data.frame for barplot
types <- rep(c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc"),3)
updown <- c(rep("up",9), rep("down",9), rep("both",9))
df_PSS_features <- data.frame(cbind(type=types, updown=updown))
df_PSS_features$number_feat_overlap <- NA
df_PSS_features$number_PSS_overlap <- NA
df_PSS_features$number_total <- NA
# prepare data frame for number of overlaps with certain features
df_PSS_features_overlap_perKb <- data.frame()
# extract number of overlaps with different feature types, both for up- and downregulated PSS positions
types <- c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc")
for(t in types){
t_feat <- which(features$type == t)
index_up <- which(df_PSS_features$type==t & df_PSS_features$updown=="up")
index_down <- which(df_PSS_features$type==t & df_PSS_features$updown=="down")
index_both <- which(df_PSS_features$type==t & df_PSS_features$updown=="both")
df_PSS_features[index_up,"number_feat_overlap"] <- sum(countOverlaps(features[t_feat], PSS_Ranges_WT)>0) # count number of features affected
df_PSS_features[index_up,"number_PSS_overlap"] <- sum(countOverlaps(PSS_Ranges_WT, features[t_feat])>0) # count where PSS are located
df_PSS_features[index_down,"number_feat_overlap"] <- sum(countOverlaps(features[t_feat], PSS_Ranges_TV)>0) # count number of features affected
df_PSS_features[index_down,"number_PSS_overlap"] <- sum(countOverlaps(PSS_Ranges_TV, features[t_feat])>0) # count where PSS are located
df_PSS_features[index_both,"number_feat_overlap"] <- sum(countOverlaps(features[t_feat], PSS_Ranges_TV)>0 | countOverlaps(features[t_feat], PSS_Ranges_WT)>0) # count number of features affected
df_PSS_features[index_both,"number_PSS_overlap"] <- sum(countOverlaps(PSS_Ranges_TV, features[t_feat])>0) + sum(countOverlaps(PSS_Ranges_WT, features[t_feat])>0) # count where PSS are located
df_PSS_features[df_PSS_features$type==t,"number_total"] <- length(features[t_feat])
counts_per_kb <- countOverlaps(features[t_feat], PSS_Ranges_WT)/(width(features[t_feat])/1000)
df_PSS_features_overlap_perKb <- rbind(df_PSS_features_overlap_perKb, data.frame(type=rep(paste(t, "-up", sep=""),length(features[t_feat])), counts_per_kb=counts_per_kb))
counts_per_kb <- countOverlaps(features[t_feat], PSS_Ranges_TV)/(width(features[t_feat])/1000)
df_PSS_features_overlap_perKb <- rbind(df_PSS_features_overlap_perKb, data.frame(type=rep(paste(t, "-down", sep=""),length(features[t_feat])), counts_per_kb=counts_per_kb))
}
df_PSS_features
## type updown number_feat_overlap number_PSS_overlap number_total
## 1 CDS up 53 83 3675
## 2 5UTR up 4 4 979
## 3 3UTR up 0 0 29
## 4 tRNA up 3 3 42
## 5 rRNA up 0 0 6
## 6 ncRNA up 0 0 318
## 7 asRNA up 1 1 1071
## 8 CRISPR up 0 0 3
## 9 misc up 0 0 11
## 10 CDS down 62 117 3675
## 11 5UTR down 6 7 979
## 12 3UTR down 1 2 29
## 13 tRNA down 1 1 42
## 14 rRNA down 0 0 6
## 15 ncRNA down 1 2 318
## 16 asRNA down 0 0 1071
## 17 CRISPR down 0 0 3
## 18 misc down 0 0 11
## 19 CDS both 93 200 3675
## 20 5UTR both 10 11 979
## 21 3UTR both 1 2 29
## 22 tRNA both 4 4 42
## 23 rRNA both 0 0 6
## 24 ncRNA both 1 2 318
## 25 asRNA both 1 1 1071
## 26 CRISPR both 0 0 3
## 27 misc both 0 0 11
write.csv(df_PSS_features, file="output/overlap_RNAfeatures_PSS.csv")
for TUs:
sum(countOverlaps(TUs, PSS_Ranges_WT)>0)
## [1] 63
sum(countOverlaps(TUs, PSS_Ranges_TV)>0)
## [1] 82
sum((countOverlaps(TUs, PSS_Ranges_WT)|countOverlaps(TUs, PSS_Ranges_TV))>0)
## [1] 119
Do further processing of data frames.
df_PSS_features_barplot <- subset(df_PSS_features, df_PSS_features$updown=="up" | df_PSS_features$updown=="down")
total_overlaps_up <- sum(countOverlaps(PSS_Ranges_WT, features)>0)
total_overlaps_down <- sum(countOverlaps(PSS_Ranges_TV, features)>0)
df_PSS_features_barplot[,"number_feat_overlap"] <- as.integer(df_PSS_features_barplot[,"number_feat_overlap"])/as.integer(df_PSS_features_barplot[,"number_total"])*100
# create column with absolute numbers
df_PSS_features_barplot$number_PSS_overlap_absolute <- NA
df_PSS_features_barplot[df_PSS_features_barplot$updown=="up",]$number_PSS_overlap_absolute <- as.integer(df_PSS_features_barplot[df_PSS_features_barplot$updown=="up","number_PSS_overlap"])
df_PSS_features_barplot[df_PSS_features_barplot$updown=="down",]$number_PSS_overlap_absolute <- as.integer(df_PSS_features_barplot[df_PSS_features_barplot$updown=="down","number_PSS_overlap"])
# calculate percentages
df_PSS_features_barplot[df_PSS_features_barplot$updown=="up","number_PSS_overlap"] <- as.integer(df_PSS_features_barplot[df_PSS_features_barplot$updown=="up","number_PSS_overlap"])/total_overlaps_up *100
df_PSS_features_barplot[df_PSS_features_barplot$updown=="down","number_PSS_overlap"] <- as.integer(df_PSS_features_barplot[df_PSS_features_barplot$updown=="down","number_PSS_overlap"])/total_overlaps_down *100
# rename types (5'UTR instead of 5UTR, 3'UTR instead of 3UTR)
df_PSS_features_barplot[,"type"] <- rep(c("CDS","5'UTR", "3'UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc"),2)
# reduce plot to information I actually care about
df_PSS_features_barplot <- df_PSS_features_barplot[which(!df_PSS_features_barplot$type=="rRNA"),] # rRNAs cannot be counted here, because multireads were excluded from analysis
df_PSS_features_barplot <- df_PSS_features_barplot[which(!df_PSS_features_barplot$type=="misc"),] # misc are actually only genes which are split by plasmid / chromosome beginning and two features (NC-2306445, NC-3547510) where I never figured out what they are. One PSS overlaps with sll8049-1. So I think one can savely ignore those
Do actual plotting.
# cast type, updown as factors to influence order in which they show up in the plots
df_PSS_features_barplot$type <- factor(df_PSS_features_barplot$type, levels=c("tRNA", "asRNA", "ncRNA", "CRISPR", "3'UTR", "5'UTR", "CDS"))
df_PSS_features_barplot$updown <- c(rep("rne(WT)", 7), rep("rne(5p)",7))
df_PSS_features_barplot$updown <- factor(df_PSS_features_barplot$updown, levels=c("rne(WT)", "rne(5p)"))
# Plot: How many features of a certain kind are overlapped by PSS?
p <- ggplot(data=df_PSS_features_barplot, aes(x=type,y=number_feat_overlap, fill=updown)) + geom_bar(stat="identity", position="dodge") + theme_light()+ xlab("") + ylab("Percentage PSS (%)") + scale_fill_manual(name="", values=c("#005a96ff", "#e69f00ff"), breaks=c("rne(WT)", "rne(5p)"), labels=c("rne(WT)", "rne(5p)"))
p
ggsave(filename = "output/barplots_overlapFeatures/barplot_percentage_of_allThisFeature.pdf", plot = p, width = 14, height = 7, units = "cm")
# Plots: How many PSS are localized in which kind of feature?
p <- ggplot(data=df_PSS_features_barplot, aes(x=updown,y=number_PSS_overlap, fill=type)) + geom_bar(stat="identity", position="stack") + theme_light()+ xlab("") + ylab("Percentage of all PSS (%)") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p
ggsave(filename = "output/barplots_overlapFeatures/barplot_percentage_ofAllPSS.pdf", plot = p, width = 7, height = 7, units = "cm")
p <- ggplot(data=df_PSS_features_barplot, aes(x=updown,y=number_PSS_overlap_absolute, fill=type)) + geom_bar(stat="identity", position="stack") + theme_light()+ xlab("") + ylab("Number PSS") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p
ggsave(filename = "output/barplots_overlapFeatures/barplot_absolute_ofAllPSS.pdf", plot = p, width = 7, height = 7, units = "cm")
Exclude rRNA, tRNA, misc from plots since not much information in those.
df_PSS_features_overlap_perKb <- df_PSS_features_overlap_perKb[which(!df_PSS_features_overlap_perKb$type=="rRNA-up"),]
df_PSS_features_overlap_perKb <- df_PSS_features_overlap_perKb[which(!df_PSS_features_overlap_perKb$type=="rRNA-down"),]
df_PSS_features_overlap_perKb <- df_PSS_features_overlap_perKb[which(!df_PSS_features_overlap_perKb$type=="tRNA-up"),]
df_PSS_features_overlap_perKb <- df_PSS_features_overlap_perKb[which(!df_PSS_features_overlap_perKb$type=="tRNA-down"),]
df_PSS_features_overlap_perKb <- df_PSS_features_overlap_perKb[which(!df_PSS_features_overlap_perKb$type=="misc-up"),]
df_PSS_features_overlap_perKb <- df_PSS_features_overlap_perKb[which(!df_PSS_features_overlap_perKb$type=="misc-down"),]
Violin plot for a first overview.
p <- ggplot(data=df_PSS_features_overlap_perKb, aes(x=type, y=counts_per_kb, fill=type)) + geom_violin() +
geom_boxplot(width=0.1, color="grey", alpha=0.2, outlier.shape=NA, show.legend = FALSE) +
scale_fill_viridis(discrete=TRUE, alpha=0.6, option="A") +
theme_light() +
labs(fill="", y="Number of PSS per feature per kb", x="")
p
Plot with log-transformation + statistics for all features (add pseudocount of 0.0001 to avoid 0-problem of log-transformation).
df_PSS_features_overlap_perKb$type <- as.factor(df_PSS_features_overlap_perKb$type)
magma_colors <- magma(length(levels(df_PSS_features_overlap_perKb$type))/2, alpha = 0.6, begin = 0, end = 1, direction = 1)[c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9)]
p <- ggplot(data=df_PSS_features_overlap_perKb, aes(x=type, y=counts_per_kb +0.0001, fill=type)) +
geom_boxplot(outlier.shape=NA) + geom_jitter(color="black", size=0.05, alpha=0.9, show.legend=FALSE)+
scale_fill_manual(values=magma_colors)+
theme_light()+ theme(legend.position = "none") +
labs(fill="", y="Number of PSS per kb", x="") + scale_y_continuous(trans="log10", limits=c(NA,100), breaks=c(0,2,4,8,16,32,64)) + scale_x_discrete(breaks=c("3UTR-down", "3UTR-up", "5UTR-down", "5UTR-up", "asRNA-down", "asRNA-up", "CDS-down", "CDS-up", "CRISPR-down", "CRISPR-up", "ncRNA-down","ncRNA-up"), labels=c("Ts\n3'UTR", "WT\n3'UTR", "Ts\n5'UTR", "WT\n5'UTR", "Ts\nasRNA", "WT\nasRNA", "Ts\nCDS", "WT\nCDS", "Ts\nCRISPR", "WT\nCRISPR", "Ts\nncRNA", "WT\nncRNA"))
# labels=rep(c("Ts", "WT"), 6))
p
Plot with log-transformation, without pseudocounts: Boxplots show statistics for features which are cleaved at all, not for all features irrespective if cleaved or not.
df_PSS_features_overlap_perKb$type <- as.factor(df_PSS_features_overlap_perKb$type)
magma_colors <- magma(length(levels(df_PSS_features_overlap_perKb$type))/2, alpha = 0.6, begin = 0, end = 1, direction = 1)[c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9)]
p <- ggplot(data=df_PSS_features_overlap_perKb, aes(x=type, y=counts_per_kb, fill=type)) +
geom_boxplot(outlier.shape=NA) + geom_jitter(color="black", size=0.05, alpha=0.9, show.legend=FALSE)+
scale_fill_manual(values=magma_colors)+
theme_light()+ theme(legend.position = "none") +
labs(fill="", y="Number of PSS per kb", x="") + scale_y_continuous(trans="log10") + scale_x_discrete(breaks=c("3UTR-down", "3UTR-up", "5UTR-down", "5UTR-up", "asRNA-down", "asRNA-up", "CDS-down", "CDS-up", "CRISPR-down", "CRISPR-up", "ncRNA-down","ncRNA-up"), labels=c("Ts\n3'UTR", "WT\n3'UTR", "Ts\n5'UTR", "WT\n5'UTR", "Ts\nasRNA", "WT\nasRNA", "Ts\nCDS", "WT\nCDS", "Ts\nCRISPR", "WT\nCRISPR", "Ts\nncRNA", "WT\nncRNA"))
# labels=rep(c("Ts", "WT"), 6))
p
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 12022 rows containing non-finite values (stat_boxplot).
ggsave("output/barplots_overlapFeatures/violin_NumberPSS_perKb.pdf", plot=p, width = 14, height = 7, units="cm")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 12022 rows containing non-finite values (stat_boxplot).
Statistics for different feature types:
summary(df_PSS_features_overlap_perKb[which(grepl("CRISPR-up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
summary(df_PSS_features_overlap_perKb[which(grepl("CRISPR-down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
summary(df_PSS_features_overlap_perKb[which(grepl("CDS-up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03747 0.00000 12.34568
summary(df_PSS_features_overlap_perKb[which(grepl("CDS-down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06156 0.00000 19.60784
summary(df_PSS_features_overlap_perKb[which(grepl("asRNA-up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.005335 0.000000 5.714286
summary(df_PSS_features_overlap_perKb[which(grepl("asRNA-down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
summary(df_PSS_features_overlap_perKb[which(grepl("ncRNA-up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
summary(df_PSS_features_overlap_perKb[which(grepl("ncRNA-down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07487 0.00000 23.80952
summary(df_PSS_features_overlap_perKb[which(grepl("5UTR-up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07112 0.00000 32.25806
summary(df_PSS_features_overlap_perKb[which(grepl("5UTR-down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1611 0.0000 43.4783
summary(df_PSS_features_overlap_perKb[which(grepl("3UTR-up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
summary(df_PSS_features_overlap_perKb[which(grepl("3UTR-down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3789 0.0000 10.9890
summary(df_PSS_features_overlap_perKb[which(grepl("up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03507 0.00000 32.25806
summary(df_PSS_features_overlap_perKb[which(grepl("down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06893 0.00000 43.47826
Control if, in general, PSS are rather detected in certain set of genes. GO annotation only exists for CDS, not for other features: Therefore, only use CDS for functional enrichment.
Detectable PSS are enriched in genes associated with photosynthesis or ribosomes. I assume both sets are, compared to other sets of genes, highly transcribed and that’s the reason why PSS can be detected in those genes. Probably, there are also stable processed RNA fragments created from other sets of transcripts, but they might be too lowly transcribed to be captured by tagRNA-Seq.
CDS_features <- features[which(features$type=="CDS")]
locus_tags_all_PSS <- CDS_features[which(countOverlaps(CDS_features, PSS_advReduce_Ranges)>0)]$locus_tag
# compared to all CDS
kegg_PSS_all_enrich <- kegg_functional_enrichment_locusList(locus_tags_all_PSS, CDS_features$locus_tag, write=TRUE, path="output/enrichment/KEGG_allPSS.csv")
## Reading KEGG annotation online:
##
## Reading KEGG annotation online:
## ID Description GeneRatio BgRatio
## syn00195 syn00195 Photosynthesis 37/330 63/953
## syn03010 syn03010 Ribosome 32/330 54/953
## syn00196 syn00196 Photosynthesis - antenna proteins 12/330 15/953
## syn00190 syn00190 Oxidative phosphorylation 28/330 49/953
## pvalue p.adjust qvalue
## syn00195 4.488607e-05 0.002782937 0.002504170
## syn03010 1.220216e-04 0.003782668 0.003403759
## syn00196 3.866458e-04 0.007990680 0.007190255
## syn00190 7.756333e-04 0.012022316 0.010818044
go_PSS_all_enrich <- go_functional_enrichment_locusList(locus_tags_all_PSS, CDS_features$locus_tag, write=TRUE, path="output/enrichment/GO_allPSS.csv")
## ID Description GeneRatio BgRatio
## GO:0042651 GO:0042651 thylakoid membrane 58/724 96/2593
## GO:0018298 GO:0018298 protein-chromophore linkage 22/724 31/2593
## GO:0003735 GO:0003735 structural constituent of ribosome 33/724 56/2593
## GO:0015979 GO:0015979 photosynthesis 45/724 86/2593
## GO:0048038 GO:0048038 quinone binding 17/724 22/2593
## GO:0006412 GO:0006412 translation 45/724 90/2593
## pvalue p.adjust qvalue
## GO:0042651 1.179467e-11 2.040478e-09 1.614008e-09
## GO:0018298 6.803585e-07 4.424105e-05 3.499444e-05
## GO:0003735 8.886063e-07 4.424105e-05 3.499444e-05
## GO:0015979 1.022914e-06 4.424105e-05 3.499444e-05
## GO:0048038 1.957421e-06 6.772676e-05 5.357152e-05
## GO:0006412 5.197638e-06 1.367076e-04 1.081350e-04
## geneID
## GO:0042651 slr1311/sll0223/sll1327/sll1326/sll1324/sll1323/ssl2615/sll1322/slr1291/smr0005/slr1604/ssr3451/slr1034/slr1390/sll1814/ssl3335/slr1834/slr1835/sll1463/sll1281/ssl2501/sml0007/sll1697/sll0851/sll0849/slr0261/slr1513/slr1329/slr1330/sml0008/ssr1386/slr0823/sll1262/sll1867/slr1279/slr1280/slr1281/ssr2831/slr1645/sll0258/slr1949/ssl0563/slr0331/sml0001/smr0001/slr0342/slr0343/sll0199/slr0228/sll0689/slr0906/ssl1690/slr0083/slr0927/sll0522/sll0520/sll0519/smr0004
## GO:0018298 slr1311/sll0928/slr0687/sll1578/sll1577/sll1124/slr1834/slr1835/slr0854/sll0851/sll0849/slr2067/slr1986/slr1963/slr1969/sll1867/slr0335/slr0473/slr0906/sll0041/slr0927/slr1459
## GO:0003735 ssr1604/ssl1784/ssr1736/sll1824/ssl3445/sll1821/sll1819/sll1816/sml0006/sll1813/sll1808/sll1803/sll1802/ssl2233/sll1746/sll1744/sll1740/slr1356/sll1767/sll1260/sll1244/slr1984/sll1101/sll1097/sll1096/ssr2799/sll0767/ssl1426/ssl0601/slr0469/slr0628/slr0923/ssr1398
## GO:0015979 sll1214/sll1398/slr0737/sll1032/sll1031/sll1029/sll1028/sll1194/sll1184/sll0928/smr0005/slr1055/sll1638/ssr3451/slr2051/ssl3093/sll1580/sll1578/sll1577/slr1834/slr1835/sll1281/sml0007/slr2067/slr1986/ssr3383/sll1418/sml0008/sll0819/slr0823/slr1347/sll1091/ssr2831/sll0427/sll0258/slr0335/sml0001/slr0772/smr0001/slr0342/slr0436/slr0525/slr1459/sll1471/smr0004
## GO:0048038 sll0223/slr2007/sll1732/slr0261/ssr1386/sll1262/slr1279/slr1280/slr1281/slr0331/ssl1690/slr0565/slr0844/sll0522/sll0521/sll0520/sll0519
## GO:0006412 ssr1604/ssl1784/ssr1736/sll1824/ssl3445/sll1821/sll1819/sll1816/sml0006/sll1813/sll1808/sll1803/sll1802/ssl2233/sll1746/sll1744/sll1740/slr1550/sll1362/slr1356/slr0877/sll1767/slr1720/sll1425/sll1260/sll1244/slr1984/sll1435/sll1101/sll1097/sll1096/ssr2799/sll1553/sll0145/sll0767/ssl1426/ssl0601/slr0469/slr0628/slr0638/slr0923/sll0546/ssr1398/slr1560/slr0604
## Count
## GO:0042651 58
## GO:0018298 22
## GO:0003735 33
## GO:0015979 45
## GO:0048038 17
## GO:0006412 45
p <- dotplot(kegg_PSS_all_enrich)
## wrong orderBy parameter; set to default `orderBy = "x"`
p
ggsave("output/enrichment/KEGG_dotplot_allPSS.pdf", plot=p, width=20, height=14, units="cm")
p <- dotplot(go_PSS_all_enrich, showCategory=26)
## wrong orderBy parameter; set to default `orderBy = "x"`
p
ggsave("output/enrichment/GO_dotplot_allPSS.pdf", plot=p, width=20, height=14, units="cm")
Compared to the positions where PSS were generally detected, no terms are enriched in set of PSS upregulated (higher in rne(WT)).
# enrichment in rne(WT) PSS
locus_tags_up <- CDS_features[which(countOverlaps(CDS_features, PSS_Ranges_WT)>0)]$locus_tag
# compared to all CDS
kegg_PSS_up_enrich <- kegg_functional_enrichment_locusList(locus_tags_up, CDS_features$locus_tag, write=TRUE, path="output/enrichment/KEGG_PSS_rneWT_universeAllCDS.csv")
## ID Description GeneRatio BgRatio
## syn00196 syn00196 Photosynthesis - antenna proteins 4/21 15/953
## pvalue p.adjust qvalue
## syn00196 0.0002040451 0.003060677 0.002577412
go_PSS_up_enrich <- go_functional_enrichment_locusList(locus_tags_up, CDS_features$locus_tag, write=TRUE, path="output/enrichment/GO_PSS_rneWT_universeAllCDS.csv")
## ID Description GeneRatio BgRatio
## GO:0030089 GO:0030089 phycobilisome 5/41 24/2593
## GO:0018298 GO:0018298 protein-chromophore linkage 5/41 31/2593
## GO:0015979 GO:0015979 photosynthesis 7/41 86/2593
## pvalue p.adjust qvalue
## GO:0030089 2.622924e-05 0.001914734 0.001822242
## GO:0018298 9.663725e-05 0.003527260 0.003356873
## GO:0015979 3.105628e-04 0.007557028 0.007191981
## geneID Count
## GO:0030089 slr2051/sll1578/slr1986/slr1963/slr0335 5
## GO:0018298 sll1578/slr1834/slr1986/slr1963/slr0335 5
## GO:0015979 sll1214/sll1184/slr2051/sll1578/slr1834/slr1986/slr0335 7
# compared to universe: Where are PSS generally localized / what was detectable
kegg_PSS_up_enrich_2 <- kegg_functional_enrichment_locusList(list_of_genes=locus_tags_up, universe_tags=locus_tags_all_PSS, write=TRUE, path="output/enrichment/KEGG_PSS_rneWT_universeDetectedCDS.csv")
## [1] ID Description GeneRatio BgRatio pvalue p.adjust
## [7] qvalue
## <0 rows> (or 0-length row.names)
go_PSS_up_enrich_2 <- go_functional_enrichment_locusList(locus_tags_up, universe_tags=locus_tags_all_PSS, write=TRUE, path="output/enrichment/GO_PSS_rneWT_universeDetectedCDS.csv")
## ID Description GeneRatio BgRatio pvalue p.adjust
## GO:0030089 GO:0030089 phycobilisome 5/41 14/724 0.0006253973 0.02689209
## qvalue geneID Count
## GO:0030089 0.02633252 slr2051/sll1578/slr1986/slr1963/slr0335 5
When using all positions of PSS as universe, GO term phycobilisome is enriched in the PSS ranges rne(WT) data set.
# enrichment in rne(TV) PSS
locus_tags_down <- CDS_features[which(countOverlaps(CDS_features, PSS_Ranges_TV)>0)]$locus_tag
# compared to all CDS
kegg_PSS_down_enrich <- kegg_functional_enrichment_locusList(locus_tags_down, CDS_features$locus_tag, write=TRUE, path="output/enrichment/KEGG_PSS_TV_universeAllCDS.csv")
## ID Description GeneRatio BgRatio pvalue p.adjust
## syn00195 syn00195 Photosynthesis 8/29 63/953 0.0003317084 0.007961001
## qvalue
## syn00195 0.007681667
go_PSS_down_enrich <- go_functional_enrichment_locusList(locus_tags_down, CDS_features$locus_tag, write=TRUE, path="output/enrichment/GO_PSS_TV_universeAllCDS.csv")
## ID Description GeneRatio BgRatio
## GO:0042651 GO:0042651 thylakoid membrane 12/52 96/2593
## GO:0015979 GO:0015979 photosynthesis 11/52 86/2593
## GO:0018298 GO:0018298 protein-chromophore linkage 7/52 31/2593
## GO:0016168 GO:0016168 chlorophyll binding 4/52 10/2593
## GO:0030089 GO:0030089 phycobilisome 4/52 24/2593
## pvalue p.adjust qvalue
## GO:0042651 2.016019e-07 1.632976e-05 1.443046e-05
## GO:0015979 5.573214e-07 2.257152e-05 1.994624e-05
## GO:0018298 1.568669e-06 4.235406e-05 3.742789e-05
## GO:0016168 2.765964e-05 5.601076e-04 4.949619e-04
## GO:0030089 1.136353e-03 1.840891e-02 1.626779e-02
## geneID
## GO:0042651 slr1311/sll1814/slr1834/slr1835/sll0851/slr1513/slr1330/sml0008/slr0823/slr1281/ssr2831/sll0199
## GO:0015979 sll1214/sll1184/slr2051/sll1578/slr1834/slr1835/slr1986/sml0008/slr0823/sll1091/ssr2831
## GO:0018298 slr1311/sll1578/slr1834/slr1835/sll0851/slr1986/slr1963
## GO:0016168 slr1311/slr1834/slr1835/sll0851
## GO:0030089 slr2051/sll1578/slr1986/slr1963
## Count
## GO:0042651 12
## GO:0015979 11
## GO:0018298 7
## GO:0016168 4
## GO:0030089 4
# compared to universe: Where are PSS generally localized / what was detectable
kegg_PSS_down_enrich_2 <- kegg_functional_enrichment_locusList(locus_tags_down, locus_tags_all_PSS, write=TRUE, path="output/enrichment/KEGG_PSS_TV_universeDetectedCDS.csv")
## [1] ID Description GeneRatio BgRatio pvalue p.adjust
## [7] qvalue
## <0 rows> (or 0-length row.names)
go_PSS_down_enrich_2 <- go_functional_enrichment_locusList(locus_tags_down, locus_tags_all_PSS, write=TRUE, path="output/enrichment/GO_PSS_TV_universeDetectedCDS.csv")
## ID Description GeneRatio BgRatio
## GO:0015979 GO:0015979 photosynthesis 11/52 45/724
## GO:0042651 GO:0042651 thylakoid membrane 12/52 58/724
## GO:0018298 GO:0018298 protein-chromophore linkage 7/52 22/724
## pvalue p.adjust qvalue
## GO:0015979 0.0001423478 0.008256170 0.007941506
## GO:0042651 0.0003811497 0.009393589 0.009035575
## GO:0018298 0.0004858753 0.009393589 0.009035575
## geneID
## GO:0015979 sll1214/sll1184/slr2051/sll1578/slr1834/slr1835/slr1986/sml0008/slr0823/sll1091/ssr2831
## GO:0042651 slr1311/sll1814/slr1834/slr1835/sll0851/slr1513/slr1330/sml0008/slr0823/slr1281/ssr2831/sll0199
## GO:0018298 slr1311/sll1578/slr1834/slr1835/sll0851/slr1986/slr1963
## Count
## GO:0015979 11
## GO:0042651 12
## GO:0018298 7
p <- dotplot(go_PSS_down_enrich_2)
## wrong orderBy parameter; set to default `orderBy = "x"`
p
ggsave("output/enrichment/GO_dotplot_PSS_rneTV.pdf", plot=p, width=20, height=14, units="cm")
count_overlaps_xNt_upDownstream <- function(Ranges_object, Ranges_object_2, sequence_object, start_stop="start", len=25){
# function returns df with total number overlapping between Ranges_object and Ranges_object_2, iterating from start or stop codon (choose by setting start_stop to "start" or "stop") of Ranges_object +/- nucleotide positions.
# prepare df to return
df_overlaps <- data.frame(rbind(rep("NA", length((-len):(+len)))))
names(df_overlaps) <- (-len):(+len)
# extract lengths of sequences (needed for catching cases in which start_end -len or +len exceeds chromosome/plasmid length) -> if this is the case: since circular DNA: start from other end with counting again (see below)
lengths <- c()
for(i in 1:length(sequence_object)){
lengths[i] <- width(sequence_object[i])
}
names(lengths) <- names(sequence_object)
# extract info about what to extract (depending on strength: upstream corresponds to either + or -)
strands <- data.frame(Ranges_object)[,5]
plus_strand <- which(strand(Ranges_object)=="+")
minus_strand <- which(strand(Ranges_object)=="-")
seqs <- as.character(data.frame(Ranges_object)[,1])
for(d in (-len):(+len)){
positions <- c()
# get positions of start_end on plus and minus strand
if(start_stop=="start"){
positions[plus_strand] <- start(Ranges_object[plus_strand])-(-d) # e.g. - (- -25) -> corresponds to -25
positions[minus_strand] <- end(Ranges_object[minus_strand])+(-d) # e.g. + (- -25) -> corresponds to +25
}else{
positions[plus_strand] <- end(Ranges_object[plus_strand])-(-d) # e.g. - (- -25) -> corresponds to -25
positions[minus_strand] <- start(Ranges_object[minus_strand])+(-d) # e.g. + (- -25) -> corresponds to +25
}
# positions < 0
indices <- which(positions < 0)
positions[indices] <- lengths[seqs[indices]]-(-positions[indices])
# positions at which positions > length of respective plasmid / chromosome
indices <- which(positions>lengths[seqs])
positions[indices] <- positions[indices]-lengths[seqs[indices]]
# create GRanges object
temp_GRanges <- GRanges(seqnames=seqs, ranges=IRanges(start = positions, end=positions), strand = strands)
# count overlaps
overlap_vect <- countOverlaps(temp_GRanges, Ranges_object_2)
df_overlaps[,as.character(d)] <- sum(overlap_vect)
}
return(df_overlaps)
}
overlaps_PSS_up_start_CDS <- count_overlaps_xNt_upDownstream(features[which(features$type=="CDS")], PSS_Ranges_WT, syne_fasta, len = 300, start_stop = "start")
overlaps_PSS_up_stop_CDS <- count_overlaps_xNt_upDownstream(features[which(features$type=="CDS")], PSS_Ranges_WT, syne_fasta, len = 300, start_stop = "stop")
overlaps_PSS_up_start_CDS_ggplot <- data.frame(t(overlaps_PSS_up_start_CDS))
overlaps_PSS_up_start_CDS_ggplot$nt_pos <- as.numeric(row.names(overlaps_PSS_up_start_CDS_ggplot))
names(overlaps_PSS_up_start_CDS_ggplot)[1] <- "count"
overlaps_PSS_up_stop_CDS_ggplot <- data.frame(t(overlaps_PSS_up_stop_CDS))
overlaps_PSS_up_stop_CDS_ggplot$nt_pos <- as.numeric(row.names(overlaps_PSS_up_stop_CDS_ggplot))
names(overlaps_PSS_up_stop_CDS_ggplot)[1] <- "count"
p <- ggplot(overlaps_PSS_up_start_CDS_ggplot, aes(x=nt_pos, y=count)) + geom_line(size=0.5, color="#005a96ff") + theme_light() + labs(x="Distance relative to start codon (nt)", y="Number of cleavage sites")+ geom_vline(aes(xintercept=0), linetype=2, color="darkgray")# + ylim(0,8)
p
ggsave("output/distance_startStop/PSS_rneWT_count_relativeStartCodon.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(overlaps_PSS_up_stop_CDS_ggplot, aes(x=nt_pos, y=count)) + geom_line(size=0.5, color="#005a96ff") + theme_light() + labs(x="Distance relative to stop codon (nt)", y="Number of cleavage sites") + geom_vline(aes(xintercept=0), linetype=2, color="darkgray")# + ylim(0,8)
p
ggsave("output/distance_startStop/PSS_rneWT_count_relativeStopCodon.pdf", plot=p, width=10, height=7, units="cm")
overlaps_PSS_down_start_CDS <- count_overlaps_xNt_upDownstream(features[which(features$type=="CDS")], PSS_Ranges_TV, syne_fasta, len = 300, start_stop = "start")
overlaps_PSS_down_stop_CDS <- count_overlaps_xNt_upDownstream(features[which(features$type=="CDS")], PSS_Ranges_TV, syne_fasta, len = 300, start_stop = "stop")
overlaps_PSS_down_start_CDS_ggplot <- data.frame(t(overlaps_PSS_down_start_CDS))
overlaps_PSS_down_start_CDS_ggplot$nt_pos <- as.numeric(row.names(overlaps_PSS_down_start_CDS_ggplot))
names(overlaps_PSS_down_start_CDS_ggplot)[1] <- "count"
overlaps_PSS_down_stop_CDS_ggplot <- data.frame(t(overlaps_PSS_down_stop_CDS))
overlaps_PSS_down_stop_CDS_ggplot$nt_pos <- as.numeric(row.names(overlaps_PSS_down_stop_CDS_ggplot))
names(overlaps_PSS_down_stop_CDS_ggplot)[1] <- "count"
p <- ggplot(overlaps_PSS_down_start_CDS_ggplot, aes(x=nt_pos, y=count)) + geom_line(size=0.5, color="#e69f00ff") + theme_light() + labs(x="Distance relative to start codon (nt)", y="Number of cleavage sites")+ geom_vline(aes(xintercept=0), linetype=2, color="darkgray")# + ylim(0,8)
p
ggsave("output/distance_startStop/PSS_TV_count_relativeStartCodon.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(overlaps_PSS_down_stop_CDS_ggplot, aes(x=nt_pos, y=count)) + geom_line(size=0.5, color="#e69f00ff") + theme_light() + labs(x="Distance relative to stop codon (nt)", y="Number of cleavage sites")+ geom_vline(aes(xintercept=0), linetype=2, color="darkgray")# + ylim(0,8)
p
ggsave("output/distance_startStop/PSS_TV_count_relativeStopCodon.pdf", plot=p, width=10, height=7, units="cm")
To investigate secondary structures around processing sites: Extract 150nt upstream / downstream of processing sites and calculate minimum free energy (\(\Delta\)G). Do this also for random peaks, start and end positions of TUs, start and end positions of CDS. Use sliding window approach to reduce problem of transcript boundaries.
# export sequences from peaks + / - 150nt
peaks_WT_150 <- extract_RNA_sequences(PSS_Ranges_WT, syne_fasta, 150)
peaks_TV_150 <- extract_RNA_sequences(PSS_Ranges_TV, syne_fasta, 150)
writeXStringSet(peaks_WT_150, "output/RNAfold/peaks_WT_150.fasta", append=FALSE,
compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(peaks_TV_150, "output/RNAfold/peaks_TV_150.fasta", append=FALSE,
compress=FALSE, compression_level=NA, format="fasta")
run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/peaks_WT_150.fasta RNAfold/mfe_files/mfe_peaks_WT_150.tsv 50 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/peaks_TV_150.fasta RNAfold/mfe_files/mfe_peaks_TV_150.tsv 50
mfe_PSS_up <- read.delim("output/RNAfold/mfe_files/mfe_peaks_WT_150.tsv", header=TRUE, row.names=1)
mfe_PSS_up_control <- read.delim("output/RNAfold/mfe_files/mfe_peaks_WT_150_control.tsv", header=TRUE, row.names=1)
mfe_PSS_down <- read.delim("output/RNAfold/mfe_files/mfe_peaks_TV_150.tsv", header=TRUE, row.names=1)
mfe_PSS_down_control <- read.delim("output/RNAfold/mfe_files/mfe_peaks_TV_150_control.tsv", header=TRUE, row.names=1)
Original sequences represented nucleotide positions before (-150:-1) and after (+1:+150) cleavage position. With the floating window approach etc., this translates to position “150.5” (index 126).
df_mfe_PSS_quartiles <- data.frame(cbind(ntpos=c(rep(c(-125:125),2)), mfe_distrib=c(apply(mfe_PSS_up, 1, median), apply(mfe_PSS_down, 1, median),apply(mfe_PSS_up, 1, quantile,probs=0.25), apply(mfe_PSS_down, 1, quantile,probs=0.25),apply(mfe_PSS_up, 1, quantile,probs=0.75), apply(mfe_PSS_down, 1, quantile,probs=0.75))))
df_mfe_PSS_quartiles$updown <- rep(c(rep("up",251), rep("down", 251)),3)
df_mfe_PSS_quartiles$median_quartile <- c(rep("median", 502), rep("1quartile", 502), rep("3quartile",502))
mean_or_median = mean
df_mfe_PSS <- data.frame(cbind(ntpos=c(rep(c(-125:125),4)), mfe=c(apply(mfe_PSS_up, 1, mean_or_median), apply(mfe_PSS_down, 1, mean_or_median), apply(mfe_PSS_up_control, 1, mean_or_median), apply(mfe_PSS_down_control,1, mean_or_median))))
df_mfe_PSS$updown <- rep(c(rep("up",251), rep("down", 251)),2)
df_mfe_PSS$control <- c(rep("data", 502), rep("control", 502))
Plot with shuffled data and non-shuffled data
p <- ggplot(data=df_mfe_PSS, aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)5ps", "rne(5p)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p
ggsave("output/RNAfold/figures/deltaG_PSS_shuffled.pdf", plot=p, width=10, height=7, units="cm")
Plot with quartiles (1st, median, 3rd)
p <- ggplot(data=df_mfe_PSS_quartiles, aes(x=ntpos, y=mfe_distrib, color=updown, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() +
scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>5p", "rne(5p)>WT")) +
scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-14,-2)
p
ggsave("output/RNAfold/figures/deltaG_PSS_quartiles.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(data=subset(df_mfe_PSS, df_mfe_PSS$control=="data"), aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>5p", "rne(5p)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p
ggsave("output/RNAfold/figures/deltaG_PSS.pdf", plot=p, width=10, height=7, units="cm")
run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/peaks_WT_150.fasta RNAfold/mfe_files/mfe_peaks_WT_150_25nt.tsv 25 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/peaks_TV_150.fasta RNAfold/mfe_files/mfe_peaks_TV_150_25nt.tsv 25
mfe_PSS_25nt_up <- read.delim("output/RNAfold/mfe_files/mfe_peaks_WT_150_25nt.tsv", header=TRUE, row.names=1)
mfe_PSS_25nt_down <- read.delim("output/RNAfold/mfe_files/mfe_peaks_TV_150_25nt.tsv", header=TRUE, row.names=1)
mfe_PSS_25nt_up_control <- read.delim("output/RNAfold/mfe_files/mfe_peaks_WT_150_25nt_control.tsv", header=TRUE, row.names=1)
mfe_PSS_25nt_down_control <- read.delim("output/RNAfold/mfe_files/mfe_peaks_TV_150_25nt_control.tsv", header=TRUE, row.names=1)
Processing site at position 150.5. When x-axis is labeled from -137.5:137.5, this corresponds to -0
df_mfe_PSS_25nt_quartiles <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),2)), mfe_distrib=c(apply(mfe_PSS_25nt_up, 1, median), apply(mfe_PSS_25nt_down, 1, median),apply(mfe_PSS_25nt_up, 1, quantile,probs=0.25), apply(mfe_PSS_25nt_down, 1, quantile,probs=0.25),apply(mfe_PSS_25nt_up, 1, quantile,probs=0.75), apply(mfe_PSS_25nt_down, 1, quantile,probs=0.75))))
df_mfe_PSS_25nt_quartiles$updown <- rep(c(rep("up",276), rep("down", 276)),3)
df_mfe_PSS_25nt_quartiles$median_quartile <- c(rep("median", 552), rep("1quartile", 552), rep("3quartile",552))
mean_or_median = mean
df_mfe_PSS_25nt <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),4)), mfe=c(apply(mfe_PSS_25nt_up, 1, mean_or_median), apply(mfe_PSS_25nt_down, 1, mean_or_median), apply(mfe_PSS_25nt_up_control, 1, mean_or_median), apply(mfe_PSS_25nt_down_control,1, mean_or_median))))
df_mfe_PSS_25nt$updown <- rep(c(rep("up",276), rep("down", 276)),2)
df_mfe_PSS_25nt$control <- c(rep("data", 552), rep("control", 552))
p <- ggplot(data=df_mfe_PSS_25nt, aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)", "rne(5p)")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p
ggsave("output/RNAfold/figures/deltaG_PSS_25nt_shuffled.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(data=df_mfe_PSS_25nt_quartiles, aes(x=ntpos, y=mfe_distrib, color=updown, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() +
scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)", "rne(5p)")) +
scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-5.1,0.1)
p
ggsave("output/RNAfold/figures/deltaG_PSS_25nt_quartiles.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(data=subset(df_mfe_PSS_25nt, df_mfe_PSS_25nt$control=="data"), aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)", "rne(5p)")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p
ggsave("output/RNAfold/figures/deltaG_PSS_25nt.pdf", plot=p, width=10, height=7, units="cm")
# use same amount of random positions
shuffle_Ranges_perPlasmid <- function(Ranges_object, sequence_object, seed=NULL){
set.seed(seed)
# extract lengths of sequences
lengths <- c()
for(i in 1:length(sequence_object)){
lengths[i] <- width(sequence_object[i])
}
names(lengths) <- names(sequence_object)
# initialise new_ranges, end has to be set according to length of plasmids / chromosome
new_ranges <- Ranges_object
start(new_ranges) <- rep(1, length(new_ranges))
# do actual shuffling
for(seq_i in 1:length(lengths)){
seq <- names(lengths)[seq_i]
indices <- which(seqnames(new_ranges)==seq)
# set ends to max possible length to avoid width < 0
end(new_ranges[indices]) <- rep(lengths[seq], length(new_ranges[indices]))
# create random numbers
start(new_ranges[indices]) <- round(runif(length(new_ranges[indices]), 1, lengths[seq]))
end(new_ranges[indices]) <- start(new_ranges[indices])
}
return(new_ranges)
}
random_up_Ranges <- shuffle_Ranges_perPlasmid(PSS_Ranges_WT, syne_fasta, 42)
random_down_Ranges <- shuffle_Ranges_perPlasmid(PSS_Ranges_TV, syne_fasta, 42)
random_up_150 <- extract_RNA_sequences(random_up_Ranges, syne_fasta, 150)
random_down_150 <- extract_RNA_sequences(random_down_Ranges, syne_fasta, 150)
writeXStringSet(random_up_150, "output/RNAfold/random_up_150.fasta", append=FALSE,
compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(random_down_150, "output/RNAfold/random_down_150.fasta", append=FALSE,
compress=FALSE, compression_level=NA, format="fasta")
run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_up_150.fasta RNAfold/mfe_files/mfe_random_up_150.tsv 50 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_down_150.fasta RNAfold/mfe_files/mfe_random_down_150.tsv 50
mfe_random_up <- read.delim("output/RNAfold/mfe_files/mfe_random_up_150.tsv", header=TRUE, row.names=1)
mfe_random_up_control <- read.delim("output/RNAfold/mfe_files/mfe_random_up_150_control.tsv", header=TRUE, row.names=1)
mfe_random_down <- read.delim("output/RNAfold/mfe_files/mfe_random_down_150.tsv", header=TRUE, row.names=1)
mfe_random_down_control <- read.delim("output/RNAfold/mfe_files/mfe_random_down_150_control.tsv", header=TRUE, row.names=1)
df_mfe_random_quartiles <- data.frame(cbind(ntpos=c(rep(c(-125:125),2)), mfe_distrib=c(apply(mfe_random_up, 1, median), apply(mfe_random_down, 1, median),apply(mfe_random_up, 1, quantile,probs=0.25), apply(mfe_random_down, 1, quantile,probs=0.25),apply(mfe_random_up, 1, quantile,probs=0.75), apply(mfe_random_down, 1, quantile,probs=0.75))))
df_mfe_random_quartiles$updown <- rep(c(rep("up",251), rep("down", 251)),3)
df_mfe_random_quartiles$median_quartile <- c(rep("median", 502), rep("1quartile", 502), rep("3quartile",502))
mean_or_median = mean
df_mfe_random <- data.frame(cbind(ntpos=c(rep(c(-125:125),4)), mfe=c(apply(mfe_random_up, 1, mean_or_median), apply(mfe_random_down, 1, mean_or_median), apply(mfe_random_up_control, 1, mean_or_median), apply(mfe_random_down_control,1, mean_or_median))))
df_mfe_random$updown <- rep(c(rep("up",251), rep("down", 251)),2)
df_mfe_random$control <- c(rep("data", 502), rep("control", 502))
Plot with shuffled data and non-shuffled data
p <- ggplot(data=df_mfe_random, aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(5p)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p
ggsave("output/RNAfold/figures/deltaG_random_shuffled.pdf", plot=p, width=10, height=7, units="cm")
Plot with quartiles (1st, median, 3rd)
p <- ggplot(data=df_mfe_random_quartiles, aes(x=ntpos, y=mfe_distrib, color=updown, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() +
scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(5p)>WT")) +
scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-14,-2)
p
ggsave("output/RNAfold/figures/deltaG_random_quartiles.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(data=subset(df_mfe_random, df_mfe_PSS$control=="data"), aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(5p)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p
ggsave("output/RNAfold/figures/deltaG_random.pdf", plot=p, width=10, height=7, units="cm")
run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_up_150.fasta RNAfold/mfe_files/mfe_random_up_150_25nt.tsv 25 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_down_150.fasta RNAfold/mfe_files/mfe_random_down_150_25nt.tsv 25
mfe_random_25nt_up <- read.delim("output/RNAfold/mfe_files/mfe_random_up_150_25nt.tsv", header=TRUE, row.names=1)
mfe_random_25nt_down <- read.delim("output/RNAfold/mfe_files/mfe_random_down_150_25nt.tsv", header=TRUE, row.names=1)
mfe_random_25nt_up_control <- read.delim("output/RNAfold/mfe_files/mfe_random_up_150_25nt_control.tsv", header=TRUE, row.names=1)
mfe_random_25nt_down_control <- read.delim("output/RNAfold/mfe_files/mfe_random_down_150_25nt_control.tsv", header=TRUE, row.names=1)
Processing site at position 150.5. When x-axis is labeled from -137.5:137.5, this corresponds to -0
df_mfe_random_25nt_quartiles <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),2)), mfe_distrib=c(apply(mfe_random_25nt_up, 1, median), apply(mfe_random_25nt_down, 1, median),apply(mfe_random_25nt_up, 1, quantile,probs=0.25), apply(mfe_random_25nt_down, 1, quantile,probs=0.25),apply(mfe_random_25nt_up, 1, quantile,probs=0.75), apply(mfe_random_25nt_down, 1, quantile,probs=0.75))))
df_mfe_random_25nt_quartiles$updown <- rep(c(rep("up",276), rep("down", 276)),3)
df_mfe_random_25nt_quartiles$median_quartile <- c(rep("median", 552), rep("1quartile", 552), rep("3quartile",552))
mean_or_median = mean
df_mfe_random_25nt <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),4)), mfe=c(apply(mfe_random_25nt_up, 1, mean_or_median), apply(mfe_random_25nt_down, 1, mean_or_median), apply(mfe_random_25nt_up_control, 1, mean_or_median), apply(mfe_random_25nt_down_control,1, mean_or_median))))
df_mfe_random_25nt$updown <- rep(c(rep("up",276), rep("down", 276)),2)
df_mfe_random_25nt$control <- c(rep("data", 552), rep("control", 552))
p <- ggplot(data=df_mfe_random_25nt, aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(5p)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p
ggsave("output/RNAfold/figures/deltaG_random_25nt_shuffled.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(data=df_mfe_random_25nt_quartiles, aes(x=ntpos, y=mfe_distrib, color=updown, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() +
scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(5p)>WT")) +
scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-5.1,0.1)
p
ggsave("output/RNAfold/figures/deltaG_random_25nt_quartiles.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(data=subset(df_mfe_random_25nt, df_mfe_random_25nt$control=="data"), aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(5p)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p
ggsave("output/RNAfold/figures/deltaG_random_25nt.pdf", plot=p, width=10, height=7, units="cm")
Check for nucleotide content around processing sites, compare between different sets of processing sites.
length_up_down <- 50
peaks_all <- extract_RNA_sequences(PSS_Ranges, syne_fasta, length_up_down)
peaks_up <- extract_RNA_sequences(PSS_Ranges_WT, syne_fasta, length_up_down)
peaks_down <- extract_RNA_sequences(PSS_Ranges_TV, syne_fasta, length_up_down)
cons_mat <- consensusMatrix(peaks_up)
perc_AU_up <- (cons_mat[1,]+cons_mat[4,])/apply(cons_mat[1:4,], 2,sum)
df_perc_ACGU_up <- data.frame(cons_mat[1:4,])/sum(cons_mat[,1])*100
df_perc_ACGU_up <- data.frame(cbind(ntpos=c(-length_up_down:-1,1:length_up_down), t(df_perc_ACGU_up)))
df_perc_AU_up <- df_perc_ACGU_up[,c(1,2,5)]
df_perc_CG_up <- df_perc_ACGU_up[,c(1,3,4)]
df_perc_ACGU_up <- pivot_longer(df_perc_ACGU_up, 2:5)
df_perc_AU_up <- pivot_longer(df_perc_AU_up, 2:3)
df_perc_CG_up <- pivot_longer(df_perc_CG_up, 2:3)
cons_mat <- consensusMatrix(peaks_down)
perc_AU_down <- (cons_mat[1,]+cons_mat[4,])/apply(cons_mat[1:4,], 2,sum)
df_perc_ACGU_down <- data.frame(cons_mat[1:4,])/sum(cons_mat[,1])*100
df_perc_ACGU_down <- data.frame(cbind(ntpos=c(-length_up_down:-1,1:length_up_down), t(df_perc_ACGU_down)))
df_perc_AU_down <- df_perc_ACGU_down[,c(1,2,5)]
df_perc_CG_down <- df_perc_ACGU_down[,c(1,3,4)]
df_perc_ACGU_down <- pivot_longer(df_perc_ACGU_down, 2:5)
df_perc_AU_down <- pivot_longer(df_perc_AU_down, 2:3)
df_perc_CG_down <- pivot_longer(df_perc_CG_down, 2:3)
df_percAU <- data.frame(cbind(ntpos=rep(c(-length_up_down:-1,1:length_up_down),2),percAU=c(perc_AU_up, perc_AU_down)*100))
df_percAU$updown <- c(rep("up",2*length_up_down), rep("down", 2*length_up_down))
general_AU <- (sum(oligonucleotideFrequency(syne_fasta, width=1)[,c(1,4)])/sum(oligonucleotideFrequency(syne_fasta, width=1)))*100
ACGU_colors <- palette_OkabeIto[c(3,1,5,6)]
names(ACGU_colors) <- c("U", "G", "C", "A")
Compare AU content between peak positions which are either up- or downregulated
p <- ggplot(data=df_percAU, aes(x=ntpos, y=percAU, color=updown)) + geom_hline(aes(yintercept=general_AU), color="darkgray", linetype=2) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=.5) + theme_light() +
scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name ="Peaks",
breaks=c("up", "down"),
labels=c("rne(WT)>rne(5p)", "rne(5p)>rne(WT)")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab("Percentage AU (%)")
p <- p + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5))
ggsave("output/nt_content_plots/AUcontent.pdf", plot=p, width=10, height=7, units="cm")
Code for disinguishing nts:
p <- ggplot(data=df_perc_ACGU_up, aes(x=ntpos, y=value, color=name)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_color_manual(name="nt", values=ACGU_colors)+ xlab("Distance Relative to Cleavage Site (nt)") + ylab("Percentage ACGU (%)") + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(5,72)+xlim(-25,+25)
p
## Warning: Removed 200 row(s) containing missing values (geom_path).
ggsave("output/nt_content_plots/ACGU_up_content.pdf", plot=p, width=10, height=7, units="cm")
## Warning: Removed 200 row(s) containing missing values (geom_path).
p <- ggplot(data=df_perc_ACGU_down, aes(x=ntpos, y=value, color=name)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=.5) + theme_light() + scale_color_manual(name="nt", values=ACGU_colors)+ xlab("Distance Relative to Cleavage Site (nt)") + ylab("Percentage ACGU (%)") + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5))+ylim(5,72)+xlim(-25,+25)
p
## Warning: Removed 200 row(s) containing missing values (geom_path).
ggsave("output/nt_content_plots/ACGU_down_content.pdf", plot=p, width=10, height=7, units="cm")
## Warning: Removed 200 row(s) containing missing values (geom_path).
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 11 (bullseye)
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
##
## locale:
## [1] LC_CTYPE=de_DE.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=de_DE.UTF-8 LC_COLLATE=de_DE.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=de_DE.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] clusterProfiler_3.18.1 colorblindr_0.1.0
## [3] colorspace_2.0-0 viridis_0.5.1
## [5] viridisLite_0.3.0 rtracklayer_1.50.0
## [7] Biostrings_2.58.0 XVector_0.30.0
## [9] vsn_3.58.0 RColorBrewer_1.1-2
## [11] pheatmap_1.0.12 ggpubr_0.4.0
## [13] ggrepel_0.9.1 edgeR_3.32.1
## [15] limma_3.46.0 DESeq2_1.30.1
## [17] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [19] MatrixGenerics_1.2.1 matrixStats_0.58.0
## [21] GenomicRanges_1.42.0 GenomeInfoDb_1.26.2
## [23] IRanges_2.24.1 S4Vectors_0.28.1
## [25] BiocGenerics_0.36.0 forcats_0.5.1
## [27] stringr_1.4.0 dplyr_1.0.5
## [29] purrr_0.3.4 readr_1.4.0
## [31] tidyr_1.1.3 tibble_3.1.0
## [33] ggplot2_3.3.3 tidyverse_1.3.0
## [35] knitr_1.31
##
## loaded via a namespace (and not attached):
## [1] shadowtext_0.0.7 readxl_1.3.1 backports_1.2.1
## [4] fastmatch_1.1-0 igraph_1.2.6 plyr_1.8.6
## [7] splines_4.0.4 BiocParallel_1.24.1 digest_0.6.27
## [10] htmltools_0.5.1.1 GOSemSim_2.16.1 GO.db_3.12.1
## [13] fansi_0.4.2 magrittr_2.0.1 memoise_2.0.0
## [16] openxlsx_4.2.3 graphlayouts_0.7.1 annotate_1.68.0
## [19] modelr_0.1.8 enrichplot_1.10.2 blob_1.2.1
## [22] rvest_1.0.0 haven_2.3.1 xfun_0.22
## [25] crayon_1.4.1 RCurl_1.98-1.2 jsonlite_1.7.2
## [28] scatterpie_0.1.5 genefilter_1.72.1 survival_3.2-7
## [31] glue_1.4.2 polyclip_1.10-0 gtable_0.3.0
## [34] zlibbioc_1.36.0 DelayedArray_0.16.2 car_3.0-10
## [37] abind_1.4-5 scales_1.1.1 DOSE_3.16.0
## [40] DBI_1.1.1 rstatix_0.7.0 Rcpp_1.0.6
## [43] xtable_1.8-4 foreign_0.8-81 bit_4.0.4
## [46] preprocessCore_1.52.1 httr_1.4.2 fgsea_1.16.0
## [49] ellipsis_0.3.1 farver_2.1.0 pkgconfig_2.0.3
## [52] XML_3.99-0.5 sass_0.3.1 dbplyr_2.1.0
## [55] locfit_1.5-9.4 utf8_1.1.4 labeling_0.4.2
## [58] tidyselect_1.1.0 rlang_0.4.10 reshape2_1.4.4
## [61] AnnotationDbi_1.52.0 munsell_0.5.0 cellranger_1.1.0
## [64] tools_4.0.4 cachem_1.0.4 downloader_0.4
## [67] cli_2.3.1 generics_0.1.0 RSQLite_2.2.3
## [70] broom_0.7.5 evaluate_0.14 fastmap_1.1.0
## [73] yaml_2.2.1 bit64_4.0.5 fs_1.5.0
## [76] tidygraph_1.2.0 zip_2.1.1 ggraph_2.0.5
## [79] DO.db_2.9 xml2_1.3.2 compiler_4.0.4
## [82] rstudioapi_0.13 curl_4.3 affyio_1.60.0
## [85] ggsignif_0.6.1 reprex_1.0.0 tweenr_1.0.1
## [88] geneplotter_1.68.0 bslib_0.2.4 stringi_1.5.3
## [91] highr_0.8 lattice_0.20-41 Matrix_1.3-2
## [94] vctrs_0.3.6 pillar_1.5.1 lifecycle_1.0.0
## [97] BiocManager_1.30.10 jquerylib_0.1.3 cowplot_1.1.1
## [100] data.table_1.14.0 bitops_1.0-6 qvalue_2.22.0
## [103] R6_2.5.0 affy_1.68.0 gridExtra_2.3
## [106] rio_0.5.26 MASS_7.3-53.1 assertthat_0.2.1
## [109] withr_2.4.1 GenomicAlignments_1.26.0 Rsamtools_2.6.0
## [112] GenomeInfoDbData_1.2.4 hms_1.0.0 grid_4.0.4
## [115] rvcheck_0.1.8 rmarkdown_2.7 carData_3.0-4
## [118] ggforce_0.3.3 lubridate_1.7.10
Innocenti, Nicolas, Monica Golumbeanu, Aymeric Fouquier d’Hérouël, Caroline Lacoux, Rémy A. Bonnin, Sean P. Kennedy, Françoise Wessner, et al. 2015. “Whole-Genome Mapping of 5’ Rna Ends in Bacteria by Tagged Sequencing: A Comprehensive View in Enterococcus Faecalis.” RNA 21 (5): 1018–30. https://doi.org/10.1261/rna.048470.114.
Kopf, M., S. Klähn, I. Scholz, J. K. F. Matthiessen, W. R. Hess, and B. Voss. 2014. “Comparative Analysis of the Primary Transcriptome of Synechocystis Sp. PCC 6803.” DNA Research 21 (5): 527–39. https://doi.org/10.1093/dnares/dsu018.
Sakurai, Isamu, Damir Stazic, Marion Eisenhut, Eerika Vuorio, Claudia Steglich, Wolfgang R. Hess, and Eva-Mari Aro. 2012. “Positive Regulation of psbA Gene Expression by Cis-Encoded Antisense RNAs in Synechocystis Sp. PCC 6803.” Plant Physiology 160 (2): 1000–1010. https://doi.org/10.1104/pp.112.202127.